# Imports

In [2]:
import re
import spacy
import pickle
import statistics
import joblib
import requests
import uuid
import json
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from tqdm import tqdm
from langdetect import detect
from sklearn.model_selection import StratifiedKFold, train_test_split, learning_curve
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from nltk import download, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer, RSLPStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, GlobalMaxPooling1D, LSTM
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.metrics import Precision, Recall
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import warnings

warnings.filterwarnings("ignore", category=UserWarning)


# Data Import

In [4]:
# Load the listing tables and the review tables (train and test) from Excel.
airbnb_df_train = pd.read_excel('./1.data_raw/train.xlsx')
airbnb_df_test = pd.read_excel('./1.data_raw/test.xlsx')
airbnb_df_train_reviews = pd.read_excel('./1.data_raw/train_reviews.xlsx')
airbnb_df_test_reviews = pd.read_excel('./1.data_raw/test_reviews.xlsx')
# Limit displayed text cells to 50 characters so DataFrame output stays readable.
pd.options.display.max_colwidth = 50

# First Data Exploration

# (rows, columns) of the training listings table.
airbnb_df_train.shape

In [None]:
# (rows, columns) of the training reviews table.
airbnb_df_train_reviews.shape

In [None]:
# Listings whose host_about text is exactly ":)".
airbnb_df_train.loc[airbnb_df_train["host_about"]==":)"]

GULIAS CODE HERE!

# First Approach - TF - IDF with Multimodel for 3 main languages

## Language Detection

In [5]:
def detect_language(text):
    """Return the language code that ``langdetect.detect`` assigns to ``text``.

    Args:
        text: The text to classify.

    Returns:
        The value returned by ``detect(text)``, or the string ``'Unknown'``
        when detection raises (for example on input it cannot classify).
    """
    try:
        return detect(text)
    # ``Exception`` instead of a bare ``except`` so that KeyboardInterrupt and
    # SystemExit still propagate and a long ``.apply`` run can be interrupted.
    except Exception:
        return 'Unknown'

### Language detection for reviews

In [None]:
# Detect the language of every review comment and store it next to the comment.
airbnb_df_train_reviews["lang_comments"] = airbnb_df_train_reviews["comments"].apply(detect_language)

In [None]:
airbnb_df_train_reviews

### Language detection for desc/host_about

In [None]:
# Detect the language of each listing description.
airbnb_df_train["lang_desc"] = airbnb_df_train["description"].apply(detect_language)

In [None]:
# Detect the language of each host's "about" text.
airbnb_df_train["lang_host"] = airbnb_df_train["host_about"].apply(detect_language)

In [None]:
airbnb_df_train

## Import detected data

In [None]:
# Reload the language-annotated tables from CSV: the "index" column becomes the
# DataFrame index and the leftover "Unnamed: 0" column is discarded.
df_train_detected = pd.read_csv("./2.data_detected/airbnb_df_train_detected.csv", index_col="index",).drop("Unnamed: 0",axis=1)
df_train_reviews_detected = pd.read_csv("./2.data_detected/airbnb_df_train__reviews_detected.csv", index_col="index").drop("Unnamed: 0",axis=1)

## Count number of different Languages

In [None]:
# First five entries of the language counts for descriptions.
df_train_detected["lang_desc"].value_counts()[0:5]

In [None]:
# First five entries of the language counts for host_about texts.
df_train_detected["lang_host"].value_counts()[0:5]

In [None]:
# First five entries of the language counts for review comments.
df_train_reviews_detected["lang_comments"].value_counts()[0:5]

In [None]:
def create_df(df, df_review, language):
    """Build the per-listing training table for a single language.

    Keeps the listings whose description and host_about were both detected as
    ``language``, concatenates all reviews of that language per listing, and
    left-joins the reviews onto the listings.

    Args:
        df: Listings with ``index``, ``description``, ``host_about``,
            ``unlisted``, ``lang_desc`` and ``lang_host``.
        df_review: Reviews with ``index``, ``comments`` and ``lang_comments``.
        language: Language code to keep, e.g. ``"en"``.

    Returns:
        DataFrame with the columns ``index``, ``description``, ``host_about``,
        ``comments`` and ``unlisted``. ``comments`` is missing (NaN) for
        listings without a review in ``language``.
    """
    columns_to_drop = ['lang_desc', 'lang_host']

    df = df[(df['lang_desc'] == language) & (df['lang_host'] == language)]
    df_review = df_review[df_review['lang_comments'] == language]

    # Join the review texts themselves. ``''.join(str(x))`` would instead store
    # the printed representation of the grouped Series (index labels, possible
    # truncation, dtype footer) as the "comments" text.
    grouped_reviews = (
        df_review.groupby('index')['comments']
        .apply(lambda x: ' '.join(x.astype(str)))
        .reset_index()
    )

    merged_df = pd.merge(df, grouped_reviews, on='index', how='left')
    merged_df = merged_df.drop(columns=columns_to_drop)
    merged_df = merged_df[["index", "description", "host_about", "comments", "unlisted"]]

    return merged_df

## join all English host_about/desc with English comments

In [None]:
# Listings whose description and host_about are English, with their English reviews.
merged_df_english = create_df(df_train_detected, df_train_reviews_detected, "en")

## Preprocessing English

In [None]:
def preprocessing_eng(row, tokenize, stop, lemmatize, stemmertize):
    """Clean a collection of English texts.

    Every element is converted to ``str``, lower-cased, stripped of HTML tags
    and of every character that is not an ASCII letter, then passed through
    the optional steps selected by the flags.

    Args:
        row: Iterable of texts (for example a DataFrame column or row). A
            single ``str`` is treated as one text instead of being iterated
            character by character.
        tokenize: Re-join the text from NLTK ``word_tokenize`` tokens.
        stop: Remove NLTK English stop words.
        lemmatize: Lemmatize each word with ``WordNetLemmatizer``.
        stemmertize: Stem each word with the English Snowball stemmer.

    Returns:
        List of cleaned strings, one per input element, in input order.
    """
    if isinstance(row, str):
        row = [row]

    # Build the language resources once per call instead of once per text.
    stop_eng = set(stopwords.words('english')) if stop else None
    lemma_eng = WordNetLemmatizer() if lemmatize else None
    stemmer_eng = SnowballStemmer('english') if stemmertize else None

    updates = []

    for j in tqdm(row):

        # LOWERCASE TEXT
        text = str(j).lower()

        # REMOVE HTML TAGS, NUMERICAL DATA and PUNCTUATION
        text = re.sub('<[^>]+>', ' ', text)
        text = re.sub("[^a-zA-Z]", " ", text)

        if tokenize:
            text = " ".join(word_tokenize(text))

        # REMOVE STOPWORDS
        if stop:
            text = " ".join([word for word in text.split() if word not in stop_eng])

        # Lemmatize
        if lemmatize:
            text = " ".join(lemma_eng.lemmatize(word) for word in text.split())

        # Stemming
        if stemmertize:
            text = " ".join(stemmer_eng.stem(word) for word in text.split())

        updates.append(text)

    return updates

In [None]:
# Fetch the NLTK resources requested by name below.
download('wordnet')
download('stopwords')
download('punkt')
columns_to_apply = ['description', 'host_about', 'comments']
# DataFrame.apply without ``axis`` works column-wise, so despite the lambda's
# parameter name each call receives one whole text column.
merged_df_english[columns_to_apply] = merged_df_english[columns_to_apply].astype(str).apply(lambda row: preprocessing_eng(row=row,
                                                                                                                        tokenize=True,
                                                                                                                        stop=True,
                                                                                                                        lemmatize = True, 
                                                                                                                        stemmertize = False
                                                                                                                        )
                                                                                            )

In [None]:
# Persist the preprocessed English table.
merged_df_english.to_csv("./3.data_train/merged_df_english.csv")

In [None]:
# Reload it without the saved row-number column and the listing index column.
merged_df_english = pd.read_csv("./3.data_train/merged_df_english.csv").drop(["Unnamed: 0", "index"],axis=1)

In [None]:
merged_df_english

## join all French host_about/desc with French comments

In [None]:
# Listings whose description and host_about are French, with their French reviews.
merged_df_french = create_df(df_train_detected, df_train_reviews_detected, "fr")

In [None]:
merged_df_french

## Preprocessing French

In [6]:
#!python -m spacy download fr_core_news_md

In [None]:
def preprocessing_fr(row, tokenize, stop, lemmatize, stemmertize):
    """Clean a collection of French texts.

    Every element is converted to ``str``, lower-cased, stripped of HTML tags,
    digits and punctuation, then passed through the optional steps selected by
    the flags.

    Args:
        row: Iterable of texts (for example a DataFrame column or row). A
            single ``str`` is treated as one text instead of being iterated
            character by character.
        tokenize: Re-join the text from NLTK ``word_tokenize`` tokens (French).
        stop: Remove NLTK French stop words.
        lemmatize: Lemmatize with the spaCy ``fr_core_news_md`` pipeline.
        stemmertize: Stem each word with the French Snowball stemmer.

    Returns:
        List of cleaned strings, one per input element, in input order.
    """
    if isinstance(row, str):
        row = [row]

    # Build the language resources once per call instead of once per text;
    # in particular the spaCy pipeline is no longer reloaded for every text.
    stop_fr = set(stopwords.words('french')) if stop else None
    lemma_fr = spacy.load("fr_core_news_md") if lemmatize else None
    stemmer_fr = SnowballStemmer('french') if stemmertize else None

    updates = []

    for j in tqdm(row):

        # LOWERCASE TEXT
        text = str(j).lower()

        # REMOVE HTML TAGS: drop whole tags such as <br/>. Deleting the
        # substring "br" everywhere also damaged words like "chambre".
        text = re.sub('<[^>]+>', ' ', text)

        # REMOVE NUMERICAL DATA and PUNCTUATION: keep letters only, including
        # accented ones. An ASCII-only filter turns "été" into " t " and
        # prevents accented stop words from matching.
        text = re.sub(r"[\W\d_]", " ", text)

        if tokenize:
            text = " ".join(word_tokenize(text, language="french"))

        # REMOVE STOPWORDS
        if stop:
            text = " ".join([word for word in text.split() if word not in stop_fr])

        # Lemmatize
        if lemmatize:
            text = " ".join(token.lemma_ for token in lemma_fr(text))

        # Stemming (the stemmer object is not callable; use its .stem method)
        if stemmertize:
            text = " ".join(stemmer_fr.stem(word) for word in text.split())

        updates.append(text)

    return updates

In [None]:
# Fetch the NLTK resources requested by name below.
download('wordnet')
download('stopwords')
columns_to_apply = ['description', 'host_about', 'comments']
# DataFrame.apply without ``axis`` works column-wise: each call receives one
# whole text column.
merged_df_french[columns_to_apply] = merged_df_french[columns_to_apply].astype(str).apply(lambda x: preprocessing_fr(row=x,
                                                                                                                     tokenize=True,
                                                                                                                     stop=True,
                                                                                                                     lemmatize = True, 
                                                                                                                     stemmertize = False
                                                                                                                    )
                                                                                         )

In [None]:
# Persist the preprocessed French table.
merged_df_french.to_csv("./3.data_train/merged_df_french.csv")

In [None]:
# Reload it without the saved row-number column and the listing index column.
merged_df_french = pd.read_csv("./3.data_train/merged_df_french.csv").drop(["Unnamed: 0", "index"],axis=1)

In [None]:
merged_df_french

## join all Portuguese host_about/desc with Portuguese comments

In [None]:
# Listings whose description and host_about are Portuguese, with their Portuguese reviews.
merged_df_pt = create_df(df_train_detected, df_train_reviews_detected, "pt")

In [None]:
merged_df_pt

## Preprocessing Portuguese

In [7]:
#!python -m spacy download pt_core_news_sm

In [None]:
def preprocessing_pt(row, tokenize, stop, lemmatize, stemmertize):
    """Clean a collection of Portuguese texts.

    Every element is converted to ``str``, lower-cased, stripped of HTML tags,
    digits and punctuation, then passed through the optional steps selected by
    the flags.

    Args:
        row: Iterable of texts (for example a DataFrame column or row). A
            single ``str`` is treated as one text instead of being iterated
            character by character.
        tokenize: Re-join the text from NLTK ``word_tokenize`` tokens
            (Portuguese).
        stop: Remove NLTK Portuguese stop words.
        lemmatize: Lemmatize with the spaCy ``pt_core_news_sm`` pipeline.
        stemmertize: Stem each word with NLTK's ``RSLPStemmer``.

    Returns:
        List of cleaned strings, one per input element, in input order.
    """
    if isinstance(row, str):
        row = [row]

    # Build the language resources once per call instead of once per text;
    # in particular the spaCy pipeline is no longer reloaded for every text.
    stop_pt = set(stopwords.words('portuguese')) if stop else None
    lemma_pt = spacy.load("pt_core_news_sm") if lemmatize else None
    stemmer_pt = RSLPStemmer() if stemmertize else None

    updates = []

    for j in tqdm(row):

        # LOWERCASE TEXT
        text = str(j).lower()

        # REMOVE HTML TAGS: drop whole tags such as <br/>. Deleting the
        # substring "br" everywhere also damaged words like "sobre".
        text = re.sub('<[^>]+>', ' ', text)

        # REMOVE NUMERICAL DATA and PUNCTUATION: keep letters only, including
        # accented ones. An ASCII-only filter splits words such as "não" and
        # prevents accented stop words from matching.
        text = re.sub(r"[\W\d_]", " ", text)

        if tokenize:
            text = " ".join(word_tokenize(text, language="portuguese"))

        # REMOVE STOPWORDS
        if stop:
            text = " ".join([word for word in text.split() if word not in stop_pt])

        # Lemmatize
        if lemmatize:
            text = " ".join(token.lemma_ for token in lemma_pt(text))

        # Stemming (the stemmer object is not callable; use its .stem method)
        if stemmertize:
            text = " ".join(stemmer_pt.stem(word) for word in text.split())

        updates.append(text)

    return updates

In [None]:
merged_df_pt.head()

In [None]:
# Fetch the NLTK resources requested by name below.
download('wordnet')
download('stopwords')
columns_to_apply = ['description', 'host_about', 'comments']
# DataFrame.apply without ``axis`` works column-wise, so despite the lambda's
# parameter name each call receives one whole text column.
merged_df_pt[columns_to_apply] = merged_df_pt[columns_to_apply].astype(str).apply(lambda row: preprocessing_pt(row=row,
                                                                                                             tokenize=True,
                                                                                                             stop=True,
                                                                                                             lemmatize = True, 
                                                                                                             stemmertize = False
                                                                                                            )
                                                                                )

In [None]:
# Persist the preprocessed Portuguese table.
merged_df_pt.to_csv("./3.data_train/merged_df_pt.csv")

In [None]:
# Reload it without the saved row-number column and the listing index column.
merged_df_pt = pd.read_csv("./3.data_train/merged_df_pt.csv").drop(["Unnamed: 0","index"],axis=1)

In [None]:
merged_df_pt

## TF - IDF, Train and Evaluation of Estimators

In [None]:
def create_target(df):
    """Split a preprocessed table into the text feature and the target.

    Adds a ``Concatenated_Text`` column to ``df`` (in place) holding
    ``description``, ``host_about`` and ``comments`` joined by single spaces.

    Args:
        df: DataFrame with ``description``, ``host_about``, ``comments`` and
            ``unlisted`` columns.

    Returns:
        Tuple ``(X, y)``: the concatenated text as a ``str`` Series and
        ``unlisted`` cast to ``int``.
    """
    text_columns = ['description', 'host_about', 'comments']

    # A missing value in any single column would turn the whole concatenation
    # into NaN (and then into the text "nan"), discarding the other columns.
    # Treat missing text as empty instead.
    parts = df[text_columns].fillna('').astype(str)

    # Concatenate text columns into a single column
    df['Concatenated_Text'] = parts['description'] + ' ' + parts['host_about'] + ' ' + parts['comments']

    # Separate features and target
    X = df['Concatenated_Text'].astype(str)
    y = df['unlisted'].astype(int)

    return X, y

In [8]:
def create_mlp(input_dim, output_dim, embedding_dim, hidden_units):
    """Assemble an uncompiled embedding + dense Keras classifier.

    The layer stack is: Embedding -> GlobalMaxPooling1D -> Dense(relu) ->
    Dense(sigmoid).

    Args:
        input_dim: Used both as the input sequence length and, plus one, as
            the size of the embedding vocabulary.
        output_dim: Number of units of the sigmoid output layer.
        embedding_dim: Size of each embedding vector.
        hidden_units: Number of units of the hidden ReLU layer.

    Returns:
        The ``Sequential`` model (not compiled).
    """
    layers = [
        Embedding(input_dim + 1, embedding_dim, input_length=input_dim, mask_zero=True),
        GlobalMaxPooling1D(),
        Dense(hidden_units, activation='relu'),
        Dense(output_dim, activation='sigmoid'),
    ]
    return Sequential(layers)

In [9]:
def create_lstm(input_dim, output_dim, embedding_dim, hidden_units):
    """Assemble an uncompiled embedding + LSTM Keras classifier.

    The layer stack is: Embedding -> LSTM -> Dense(sigmoid).

    Args:
        input_dim: Used both as the input sequence length and, plus one, as
            the size of the embedding vocabulary.
        output_dim: Number of units of the sigmoid output layer.
        embedding_dim: Size of each embedding vector.
        hidden_units: Number of LSTM units.

    Returns:
        The ``Sequential`` model (not compiled).
    """
    embedding = Embedding(input_dim + 1, embedding_dim, input_length=input_dim, mask_zero=True)
    recurrent = LSTM(hidden_units)
    output = Dense(output_dim, activation='sigmoid')
    return Sequential([embedding, recurrent, output])


In [None]:
def custom_fit_evaluate(X,y,estimator):
    """Cross-validate ``estimator`` on TF-IDF features and plot the confusion matrix.

    Runs a shuffled 5-fold stratified split. In each fold a TF-IDF vocabulary
    is fitted on the training texts only, the estimator is fitted on the
    training fold and scored on the validation fold.

    NOTE: a later cell of this notebook defines another function with the
    same name and a different signature; after that cell runs, this version
    is no longer reachable under this name.

    Args:
        X: Texts, one per sample (pandas Series or array-like of ``str``).
        y: Binary targets aligned with ``X``.
        estimator: Object with ``fit(X, y)`` and ``predict(X)`` returning
            class labels.

    Returns:
        Tuple ``(precision, recall, f1, accuracy)``, each the mean over the
        five folds.
    """
    # StratifiedKFold yields positional indices. Index plain arrays so the
    # split stays correct when X / y carry a pandas index that is not 0..n-1.
    texts = np.asarray(X)
    targets = np.asarray(y)
    classes = np.unique(targets)

    precision_avg = []
    recall_avg = []
    f1_avg = []
    accuracy_avg = []

    # Summed over all folds so the plot matches the averaged metrics instead
    # of showing the last fold only.
    cm = np.zeros((len(classes), len(classes)), dtype=int)

    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
    for train_index, test_index in skf.split(texts, targets):
        x_train, x_val = texts[train_index], texts[test_index]
        y_train, y_val = targets[train_index], targets[test_index]

        # Vocabulary and IDF weights come from the training fold only.
        tfidf = TfidfVectorizer()
        x_train_tfidf = tfidf.fit_transform(x_train)
        x_val_tfidf = tfidf.transform(x_val)

        estimator.fit(x_train_tfidf, y_train)
        y_pred = estimator.predict(x_val_tfidf)

        precision_avg.append(precision_score(y_val, y_pred))
        recall_avg.append(recall_score(y_val, y_pred))
        f1_avg.append(f1_score(y_val, y_pred))
        accuracy_avg.append(accuracy_score(y_val, y_pred))

        # Fixed label order keeps the per-fold matrices the same shape.
        cm += confusion_matrix(y_val, y_pred, labels=classes)

    precision_avg = statistics.mean(precision_avg)
    recall_avg = statistics.mean(recall_avg)
    f1_avg = statistics.mean(f1_avg)
    accuracy_avg = statistics.mean(accuracy_avg)

    # Plot the confusion matrix
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, cmap="Blues", fmt="d", cbar=False)
    plt.xlabel("Predicted labels")
    plt.ylabel("True labels")
    plt.title("Confusion Matrix")
    plt.xticks(np.arange(len(classes)) + 0.5, classes)
    plt.yticks(np.arange(len(classes)) + 0.5, classes)
    plt.show()

    return precision_avg, recall_avg, f1_avg, accuracy_avg

## English

In [None]:
# Concatenated-text feature and integer target for the English listings.
X_eng, y_eng = create_target(merged_df_english)

In [None]:
rf_classifier_eng = RandomForestClassifier()

In [None]:
# NOTE(review): X_eng holds strings, so max_length counts characters and
# pad_sequences is handed raw text rather than integer sequences; confirm
# this cell runs and that padded_comments is needed (it is not used below).
max_length = max(len(comment) for comment in X_eng)
padded_comments = pad_sequences(X_eng, maxlen=max_length, padding='post')

In [None]:
# Build and compile the embedding + dense network, sized by max_length.
mlp_eng = create_mlp(input_dim=max_length, output_dim=1, embedding_dim=50, hidden_units=64)
mlp_eng.summary()
mlp_eng.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy', Precision(), Recall()])

In [None]:
# Build and compile the embedding + LSTM network, sized by max_length.
lstm_eng = create_lstm(input_dim=max_length, output_dim=1, embedding_dim=50, hidden_units=64)
lstm_eng.summary()
lstm_eng.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy', Precision(), Recall()])

In [None]:
# 5-fold cross-validation of the random forest on TF-IDF features.
custom_fit_evaluate(X_eng, y_eng, rf_classifier_eng)

In [None]:
# NOTE(review): the Keras models start with an Embedding layer but are given
# TF-IDF matrices here, and their predict() presumably returns probabilities
# rather than class labels; verify these two evaluations work as intended.
custom_fit_evaluate(X_eng, y_eng, mlp_eng)

In [None]:
custom_fit_evaluate(X_eng, y_eng, lstm_eng)

In [None]:
# Final vectorizer fitted on the full English data set.
tfidf_eng = TfidfVectorizer()
x_train_tfidf_eng = tfidf_eng.fit_transform(X_eng)

In [None]:
# Final models fitted on the full English data set.
rf_classifier_eng.fit(x_train_tfidf_eng, y_eng)

In [None]:
mlp_eng.fit(x_train_tfidf_eng, y_eng)

In [None]:
lstm_eng.fit(x_train_tfidf_eng, y_eng)

In [None]:
# Persist the fitted vectorizer for use at prediction time.
with open('./5.Vectorizer/tfidf_vectorizer_eng.pkl', 'wb') as file:
    pickle.dump(tfidf_eng, file)

In [None]:

# Persist the fitted models.
# NOTE(review): Keras models are usually saved with model.save(); confirm
# that joblib round-trips mlp_eng / lstm_eng correctly.
joblib.dump(rf_classifier_eng, "./4.models/rf_classifier_eng.joblib")
joblib.dump(mlp_eng, "./4.models/mlp_eng.joblib")
joblib.dump(lstm_eng, "./4.models/lstm_eng.joblib")

## French

In [None]:
# Concatenated-text feature and integer target for the French listings.
X_fr, y_fr = create_target(merged_df_french)

In [None]:
rf_classifier_fr = RandomForestClassifier()

In [None]:
# NOTE(review): X_fr holds strings, so max_length counts characters and
# pad_sequences is handed raw text rather than integer sequences; confirm
# this cell runs and that padded_comments is needed (it is not used below).
max_length = max(len(comment) for comment in X_fr)
padded_comments = pad_sequences(X_fr, maxlen=max_length, padding='post')

In [None]:
# Build and compile the embedding + dense network, sized by max_length.
mlp_fr = create_mlp(input_dim=max_length, output_dim=1, embedding_dim=50, hidden_units=64)
mlp_fr.summary()
mlp_fr.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy', Precision(), Recall()])

In [None]:
# Build and compile the embedding + LSTM network, sized by max_length.
lstm_fr = create_lstm(input_dim=max_length, output_dim=1, embedding_dim=50, hidden_units=64)
lstm_fr.summary()
lstm_fr.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy', Precision(), Recall()])

In [None]:
# 5-fold cross-validation of the random forest on TF-IDF features.
custom_fit_evaluate(X_fr, y_fr, rf_classifier_fr)

In [None]:
# NOTE(review): the Keras models start with an Embedding layer but are given
# TF-IDF matrices here, and their predict() presumably returns probabilities
# rather than class labels; verify these two evaluations work as intended.
custom_fit_evaluate(X_fr, y_fr, mlp_fr)

In [None]:
custom_fit_evaluate(X_fr, y_fr, lstm_fr)

In [None]:
# Final vectorizer fitted on the full French data set.
tfidf_fr = TfidfVectorizer()
x_train_tfidf_fr = tfidf_fr.fit_transform(X_fr)

In [None]:
# Final models fitted on the full French data set.
rf_classifier_fr.fit(x_train_tfidf_fr, y_fr)

In [None]:
mlp_fr.fit(x_train_tfidf_fr, y_fr)

In [None]:
lstm_fr.fit(x_train_tfidf_fr, y_fr)

In [None]:
# Persist the fitted vectorizer for use at prediction time.
with open('./5.Vectorizer/tfidf_vectorizer_fr.pkl', 'wb') as file:
    pickle.dump(tfidf_fr, file)

In [None]:
# Persist the fitted models.
# NOTE(review): Keras models are usually saved with model.save(); confirm
# that joblib round-trips mlp_fr / lstm_fr correctly.
joblib.dump(rf_classifier_fr, "./4.models/rf_classifier_fr.joblib")
joblib.dump(mlp_fr, "./4.models/mlp_fr.joblib")
joblib.dump(lstm_fr, "./4.models/lstm_fr.joblib")

## Portuguese

In [None]:
# Concatenated-text feature and integer target for the Portuguese listings.
X_pt, y_pt = create_target(merged_df_pt)

In [None]:
rf_classifier_pt = RandomForestClassifier()

In [None]:
# NOTE(review): X_pt holds strings, so max_length counts characters and
# pad_sequences is handed raw text rather than integer sequences; confirm
# this cell runs and that padded_comments is needed (it is not used below).
max_length = max(len(comment) for comment in X_pt)
padded_comments = pad_sequences(X_pt, maxlen=max_length, padding='post')

In [None]:
# Build and compile the embedding + dense network, sized by max_length.
mlp_pt = create_mlp(input_dim=max_length, output_dim=1, embedding_dim=50, hidden_units=64)
mlp_pt.summary()
mlp_pt.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy', Precision(), Recall()])

In [None]:
# Build and compile the embedding + LSTM network, sized by max_length.
lstm_pt = create_lstm(input_dim=max_length, output_dim=1, embedding_dim=50, hidden_units=64)
lstm_pt.summary()
lstm_pt.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy', Precision(), Recall()])

In [None]:
# 5-fold cross-validation of the random forest on TF-IDF features.
custom_fit_evaluate(X_pt, y_pt, rf_classifier_pt)

In [None]:
# NOTE(review): the Keras models start with an Embedding layer but are given
# TF-IDF matrices here, and their predict() presumably returns probabilities
# rather than class labels; verify these two evaluations work as intended.
custom_fit_evaluate(X_pt, y_pt, mlp_pt)

In [None]:
custom_fit_evaluate(X_pt, y_pt, lstm_pt)

In [None]:
# Final vectorizer fitted on the full Portuguese data set.
tfidf_pt = TfidfVectorizer()
x_train_tfidf_pt = tfidf_pt.fit_transform(X_pt)

In [None]:
# Final models fitted on the full Portuguese data set.
rf_classifier_pt.fit(x_train_tfidf_pt, y_pt)

In [None]:
mlp_pt.fit(x_train_tfidf_pt, y_pt)

In [None]:
lstm_pt.fit(x_train_tfidf_pt, y_pt)

In [None]:
# Persist the fitted vectorizer for use at prediction time.
with open('./5.Vectorizer/tfidf_vectorizer_pt.pkl', 'wb') as file:
    pickle.dump(tfidf_pt, file)

In [None]:
# Persist the fitted models.
# NOTE(review): Keras models are usually saved with model.save(); confirm
# that joblib round-trips mlp_pt / lstm_pt correctly.
joblib.dump(rf_classifier_pt, "./4.models/rf_classifier_pt.joblib")
joblib.dump(mlp_pt, "./4.models/mlp_pt.joblib")
joblib.dump(lstm_pt, "./4.models/lstm_pt.joblib")

## Prediction

In [None]:
def translate_to_eng(text):
    """Translate ``text`` to English through the Microsoft Translator REST API.

    The subscription key is read from the ``AZURE_TRANSLATOR_KEY`` environment
    variable so that no secret is stored in the notebook.
    NOTE: the key that used to be hard-coded here has been committed in
    plain text and should be treated as compromised and rotated.

    Args:
        text: The text to translate.

    Returns:
        The first translation returned by the service, as a string.

    Raises:
        RuntimeError: If ``AZURE_TRANSLATOR_KEY`` is not set.
        requests.HTTPError: If the service answers with an error status.
    """
    import os  # function-scope import: the notebook does not import os globally

    key = os.environ.get("AZURE_TRANSLATOR_KEY")
    if not key:
        raise RuntimeError(
            "Set the AZURE_TRANSLATOR_KEY environment variable to your "
            "Translator subscription key."
        )

    endpoint = "https://api.cognitive.microsofttranslator.com"
    location = "westeurope"

    path = '/translate'
    constructed_url = endpoint + path

    params = {
        'api-version': '3.0',
        'to': 'en'
    }

    headers = {
        'Ocp-Apim-Subscription-Key': key,
        'Ocp-Apim-Subscription-Region': location,
        'Content-type': 'application/json',
        'X-ClientTraceId': str(uuid.uuid4())
    }

    body = [{
        'text': text
    }]

    # A timeout keeps a stalled connection from blocking the whole prediction
    # run; raise_for_status surfaces API errors directly instead of as a
    # KeyError when the error payload is indexed below.
    request = requests.post(constructed_url, params=params, headers=headers, json=body, timeout=30)
    request.raise_for_status()
    response = request.json()

    return response[0]["translations"][0]["text"]

In [None]:
def create_feature_row(row):
    """Join the three text fields of a row into one feature string.

    Args:
        row: Sequence or pandas Series whose positions 1, 2 and 3 hold the
            description, the host_about text and the comments (position 0 is
            the listing index).

    Returns:
        ``"<description> <host_about> <comments>"``, each part converted with
        ``str``.

    Raises:
        IndexError: If ``row`` has fewer than four elements.
    """
    # Materialise the values so positions 1-3 are read positionally for both
    # lists and Series; ``series[1]`` on a string-labelled Series relies on a
    # deprecated positional fallback.
    values = list(row)

    # Concatenate text columns into a single column
    return ' '.join(str(values[position]) for position in (1, 2, 3))

In [None]:
def merge_test_df(df, df_review):
    """Attach the concatenated reviews of each listing to the test listings.

    Args:
        df: Listings with ``index``, ``description`` and ``host_about``.
        df_review: Reviews with ``index`` and ``comments``.

    Returns:
        DataFrame with the columns ``index``, ``description``, ``host_about``
        and ``comments``. ``comments`` is missing (NaN) for listings without
        reviews.
    """
    # Join the review texts themselves. ``''.join(str(x))`` would instead store
    # the printed representation of the grouped Series (index labels, possible
    # truncation, dtype footer) as the "comments" text.
    grouped_reviews = (
        df_review.groupby('index')['comments']
        .apply(lambda x: ' '.join(x.astype(str)))
        .reset_index()
    )
    merged_df = pd.merge(df, grouped_reviews, on='index', how='left')
    merged_df = merged_df[["index", "description", "host_about", "comments"]]

    return merged_df

In [None]:
# One row per test listing, with its reviews attached.
merged_test_df = merge_test_df(airbnb_df_test, airbnb_df_test_reviews)

In [None]:
merged_test_df

In [None]:
# Vectorizer/model pairs already read from disk, keyed by language suffix.
# Cleared only by restarting the kernel (or emptying the dict), so re-saved
# artifacts are not picked up automatically.
_PREDICTION_ARTIFACTS = {}


def _load_prediction_artifacts(suffix):
    """Return the (TF-IDF vectorizer, random forest) pair for ``suffix``, loading it once."""
    if suffix not in _PREDICTION_ARTIFACTS:
        # NOTE: pickle/joblib execute code while loading; only load artifacts
        # written by this notebook.
        with open(f'./5.Vectorizer/tfidf_vectorizer_{suffix}.pkl', 'rb') as file:
            vectorizer = pickle.load(file)
        model = joblib.load(f"./4.models/rf_classifier_{suffix}.joblib")
        _PREDICTION_ARTIFACTS[suffix] = (vectorizer, model)
    return _PREDICTION_ARTIFACTS[suffix]


def prediction(row):
    """Predict the ``unlisted`` label for one merged test row.

    If description, host_about and comments are all detected as English,
    French or Portuguese, the row is preprocessed and classified with that
    language's vectorizer and random forest. Otherwise the concatenated text
    is translated to English and classified with the English artifacts.

    Args:
        row: Sequence or pandas Series whose positions 1, 2 and 3 hold the
            description, the host_about text and the comments.

    Returns:
        The predicted label as an ``int``.
    """
    fields = list(row)
    lang_desc = detect_language(str(fields[1]))
    lang_host = detect_language(str(fields[2]))
    lang_review = detect_language(str(fields[3]))

    # Detected language -> (preprocessing function, artifact file suffix).
    preprocessors = {
        "en": (preprocessing_eng, "eng"),
        "fr": (preprocessing_fr, "fr"),
        "pt": (preprocessing_pt, "pt"),
    }
    options = dict(tokenize=True, stop=True, lemmatize=True, stemmertize=False)

    if lang_desc == lang_host == lang_review and lang_desc in preprocessors:
        preprocess, suffix = preprocessors[lang_desc]
        text = create_feature_row(preprocess(row=fields, **options))
    else:
        suffix = "eng"
        translated = translate_to_eng(create_feature_row(fields))
        # Wrap the translated text in a list: the preprocessing functions
        # iterate over their input, so a bare string would be processed one
        # character at a time and only the first character classified.
        text = preprocessing_eng(row=[translated], **options)[0]

    # Loading from disk once per language instead of once per row.
    vectorizer, model = _load_prediction_artifacts(suffix)
    return int(model.predict(vectorizer.transform([text]))[0])

In [None]:
# Row-wise prediction (axis=1 passes one listing at a time).
merged_test_df["prediction"] = merged_test_df.apply(lambda row: prediction(row), axis=1)

In [None]:
# Distribution of predicted labels, including missing values if any.
merged_test_df["prediction"].value_counts(dropna=False)

In [None]:
merged_test_df

In [None]:
# Persist the predictions of the first approach.
merged_test_df.to_csv("./6.Predictions/prediction_approach_1.csv")

# Second Approach 2.1 - Glove Embedding with subsample of data (reaching resource limits)

In [None]:
def clean_text(text):
    """Normalise a value to lower-case ASCII letters and spaces.

    The value is converted with ``str``, lower-cased, HTML tags are replaced
    by one space each, and every remaining character that is not an ASCII
    letter is replaced by a space.

    Args:
        text: Any value; it is converted with ``str`` first.

    Returns:
        The cleaned string.
    """
    lowered = str(text).lower()
    without_tags = re.sub('<[^>]+>', ' ', lowered)
    return re.sub("[^a-zA-Z]", " ", without_tags)

## Load previous detected and stored Dataset

In [None]:
# Language-annotated reviews, without the leftover "Unnamed: 0" column.
df_train_reviews = pd.read_csv("/content/drive/MyDrive/Uni/Text Mining/2.data_detected/airbnb_df_train__reviews_detected.csv").drop("Unnamed: 0", axis=1)

In [None]:
# Language-annotated listings, without the leftover "Unnamed: 0" column.
df_train = pd.read_csv("/content/drive/MyDrive/Uni/Text Mining/2.data_detected/airbnb_df_train_detected.csv").drop("Unnamed: 0", axis=1)

## Merge the two train Dataframes

In [None]:
# Left join: one row per (listing, review); listings without reviews are kept.
df_merged = pd.merge(df_train,df_train_reviews, on="index", how="left")

In [None]:
def prepare_df(df):
    """Collapse a listing/review join to one row per listing.

    Per ``index`` value, the first ``description``, ``host_about`` and
    ``unlisted`` are kept, while ``comments`` and ``lang_comments`` are each
    converted to ``str`` and joined with single spaces.

    Args:
        df: DataFrame with ``index``, ``description``, ``host_about``,
            ``comments``, ``lang_comments`` and ``unlisted`` columns.

    Returns:
        DataFrame with one row per ``index`` and ``index`` as a regular column.
    """
    def _join_as_text(series):
        # Space-separated string of all values in the group.
        return ' '.join(series.astype(str).values)

    aggregations = {
        'description': 'first',
        'host_about': 'first',
        'comments': _join_as_text,
        'lang_comments': _join_as_text,
        "unlisted": "first",
    }

    return df.groupby('index').agg(aggregations).reset_index()

## Grouping and Preprocessing of the Dataframes

In [None]:
# One row per listing, with all its reviews joined into one text.
grouped_df = prepare_df(df_merged)

In [None]:
columns_to_clean = ['description', 'host_about', 'comments']

# Apply clean_text to every cell of the three text columns.
grouped_df[columns_to_clean] = grouped_df[columns_to_clean].applymap(clean_text)

In [None]:
# Work on a random subsample of 800 listings (fixed seed).
df = grouped_df.sample(800, random_state=1)

# Filter out rows with missing values in necessary columns
# NOTE(review): clean_text has already converted every cell with str(), so
# missing values are presumably the text "nan" by now and this dropna likely
# removes nothing; confirm.
df = df.dropna(subset=['description', 'host_about', 'comments'])

# Concatenate the three text columns into one text per listing
text_data = df['description'] + ' ' + df['host_about'] + ' ' + df['comments']
labels = df['unlisted']

# Split the data into train and test sets (80/20, fixed seed)
text_train, text_test, labels_train, labels_test = train_test_split(text_data, labels, test_size=0.2, random_state=42)

# Fit the tokenizer on the training texts only
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text_train)

# One more than the number of known words (presumably to leave index 0 for padding)
vocab_size = len(tokenizer.word_index) + 1
# Longest training text, counted in whitespace-separated words
max_sequence_length = max([len(sentence.split()) for sentence in text_train])

# Convert the texts into sequences of word indices
sequences_train = tokenizer.texts_to_sequences(text_train)
sequences_test = tokenizer.texts_to_sequences(text_test)

In [None]:
# Pad sequences at the end so that all have length max_sequence_length
padded_train = pad_sequences(sequences_train, maxlen=max_sequence_length, padding='post')
padded_test = pad_sequences(sequences_test, maxlen=max_sequence_length, padding='post')

## Load Glove Word Embeddings

In [None]:
# Load pre-trained GloVe word embeddings into a word -> vector dictionary
embeddings_index = {}
with open('/content/drive/MyDrive/glove.6B/glove.6B.100d.txt', encoding='utf8') as f:
    for line in f:
        # Each line: the word followed by its vector components
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

In [None]:
# Create embedding matrix: row i holds the GloVe vector of the word with
# tokenizer index i; words without a GloVe vector keep an all-zero row.
embedding_dim = 100
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, i in tokenizer.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [None]:
# Replace every word index by its embedding vector. Each loop item (named
# "word") is one padded sequence of indices, so the result has one
# (sequence length x embedding_dim) block per text.
embedded_train = np.array([embedding_matrix[word] for word in padded_train])
embedded_test = np.array([embedding_matrix[word] for word in padded_test])

In [None]:
# Flatten each text's block into a single feature vector.
embedded_train_flat = embedded_train.reshape(embedded_train.shape[0], -1)
embedded_test_flat = embedded_test.reshape(embedded_test.shape[0], -1)

## Fit and Evaluate the Models

In [13]:
def custom_fit_evaluate(classifier, X_train, y_train, X_val, y_val):
    """Fit ``classifier`` on the training split and report validation metrics.

    The classifier passed in is fitted in place, so the caller's object can be
    used for prediction afterwards. It is no longer replaced by a fresh
    ``RandomForestClassifier``, which made every call evaluate a random forest
    regardless of the argument and left the caller's object unfitted.

    Args:
        classifier: Estimator with ``fit(X, y)`` and ``predict(X)`` returning
            class labels.
        X_train: Training features.
        y_train: Training labels.
        X_val: Validation features.
        y_val: Validation labels.

    Returns:
        The predictions for ``X_val``. Accuracy, precision, recall and F1 are
        printed.
    """
    # Train the classifier that was passed in
    classifier.fit(X_train, y_train)

    # Predict on the validation set
    predictions = classifier.predict(X_val)

    # Calculate evaluation metrics
    accuracy = accuracy_score(y_val, predictions)
    precision = precision_score(y_val, predictions)
    recall = recall_score(y_val, predictions)
    f1 = f1_score(y_val, predictions)

    print("Accuracy:", accuracy)
    print("Precision:", precision)
    print("Recall:", recall)
    print("F1 Score:", f1)

    return predictions


In [None]:
rf_classifier_2_1 = RandomForestClassifier(random_state=42)

In [None]:
# Evaluate the random forest on the flattened GloVe features.
rf_classifier_2_1_pred = custom_fit_evaluate(rf_classifier_2_1, embedded_train_flat, labels_train, embedded_test_flat, labels_test)

In [None]:
# Build and compile the embedding + dense network.
mlp_2_1 = create_mlp(input_dim=max_sequence_length, output_dim=1, embedding_dim=50, hidden_units=64)
mlp_2_1.summary()
mlp_2_1.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy', Precision(), Recall()])

In [None]:
# NOTE(review): this passes rf_classifier_2_1, not mlp_2_1, so the "mlp"
# predictions come from a random forest; confirm whether that is intended.
mlp_2_1_pred = custom_fit_evaluate(rf_classifier_2_1, embedded_train_flat, labels_train, embedded_test_flat, labels_test)

In [None]:
# Build and compile the embedding + LSTM network.
lstm_2_1 = create_lstm(input_dim=max_sequence_length, output_dim=1, embedding_dim=50, hidden_units=64)
lstm_2_1.summary()
lstm_2_1.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy', Precision(), Recall()])

In [None]:
# NOTE(review): this passes rf_classifier_2_1, not lstm_2_1, so the "lstm"
# predictions come from a random forest; confirm whether that is intended.
lstm_2_1_pred = custom_fit_evaluate(rf_classifier_2_1, embedded_train_flat, labels_train, embedded_test_flat, labels_test)

In [None]:
def plot_confusion_matrix(labels, predictions):
    """Draw the confusion matrix of ``predictions`` against ``labels``.

    Args:
        labels: True class labels.
        predictions: Predicted class labels, aligned with ``labels``.

    Returns:
        None. The heatmap is shown with ``plt.show()``.
    """
    # Every class that occurs among the true or the predicted labels
    observed_classes = np.unique(np.concatenate((labels, predictions)))
    # Ticks are shifted by 0.5 to sit in the middle of each heatmap cell
    tick_positions = np.arange(len(observed_classes)) + 0.5

    matrix = confusion_matrix(labels, predictions)

    plt.figure(figsize=(8, 6))
    sns.heatmap(matrix, annot=True, cmap="Blues", fmt="d", cbar=False)
    plt.xlabel("Predicted labels")
    plt.ylabel("True labels")
    plt.title("Confusion Matrix")
    plt.xticks(tick_positions, observed_classes)
    plt.yticks(tick_positions, observed_classes)
    plt.show()

In [None]:
# Confusion matrices of the three prediction arrays against the held-out labels.
plot_confusion_matrix(labels_test, rf_classifier_2_1_pred)

In [None]:
plot_confusion_matrix(labels_test, mlp_2_1_pred)

In [None]:
plot_confusion_matrix(labels_test, lstm_2_1_pred)

In [None]:
# Drop the references to the flattened feature arrays.
del embedded_train_flat
del embedded_test_flat

## Loading Test Dataframe and preparing them for the Prediction

In [None]:
# Test listings and their reviews.
df_test = pd.read_excel("/content/drive/MyDrive/Uni/Text Mining/test.xlsx")
df_test_reviews = pd.read_excel("/content/drive/MyDrive/Uni/Text Mining/test_reviews.xlsx")

In [None]:
# Left join: one row per (listing, review); listings without reviews are kept.
test_df = pd.merge(df_test,df_test_reviews, on="index", how="left")

In [None]:
    # Collapse to one row per listing: first description/host_about, all
    # comments converted to str and joined with spaces.
    test_df = test_df.groupby('index').agg({
        'description': 'first',
        'host_about': 'first',
        'comments': lambda x: ' '.join(x.astype(str).values),
    }).reset_index()

## Prediction

In [None]:
def predict_row(classifier, row):
    """Predict the label of one test row from its GloVe features.

    Uses the module-level ``tokenizer``, ``max_sequence_length`` and
    ``embedding_matrix`` built for training.

    Args:
        classifier: Fitted estimator with a ``predict`` method.
        row: Row with ``description``, ``host_about`` and ``comments``.

    Returns:
        The first (and only) element of the classifier's prediction.
    """
    text_columns = ["description", "host_about", "comments"]

    # Clean the three text fields and join them with single spaces.
    cleaned = row[text_columns].apply(clean_text)
    combined = ' '.join(cleaned[column] for column in text_columns)

    # Word indices, padded at the end to the training sequence length.
    token_ids = tokenizer.texts_to_sequences([combined])
    padded = pad_sequences(token_ids, maxlen=max_sequence_length, padding='post')

    # Look up the embedding of every index, then flatten to one feature vector.
    embedded = embedding_matrix[padded]
    features = embedded.reshape(embedded.shape[0], -1)

    return classifier.predict(features)[0]

In [None]:
# Row-wise prediction with the random forest of approach 2.1.
test_df['predictions'] = test_df.apply(lambda row: predict_row(rf_classifier_2_1, row), axis=1)

In [None]:
# Distribution of predicted labels.
test_df['predictions'].value_counts()

In [None]:
# Persist the predictions of approach 2.1.
test_df.to_csv("./6.Predictions/prediction_approach_2_1.csv")

# Second Approach 2.2 - Glove Embedding batch wise to avoid resource limitations

## Load previous detected and stored Dataset

In [None]:
# Language-annotated reviews, without the leftover "Unnamed: 0" column.
df_train_reviews = pd.read_csv("/content/drive/MyDrive/Uni/Text Mining/2.data_detected/airbnb_df_train__reviews_detected.csv").drop("Unnamed: 0", axis=1)

In [None]:
# Language-annotated listings, without the leftover "Unnamed: 0" column.
df_train = pd.read_csv("/content/drive/MyDrive/Uni/Text Mining/2.data_detected/airbnb_df_train_detected.csv").drop("Unnamed: 0", axis=1)

## Merge the two Dataframes

In [None]:
# Left join: one row per (listing, review); listings without reviews are kept.
df_merged = pd.merge(df_train,df_train_reviews, on="index", how="left")

In [None]:
# One row per listing, with all its reviews joined into one text.
grouped_df = prepare_df(df_merged)

In [None]:
columns_to_clean = ['description', 'host_about', 'comments']

# Apply clean_text to every cell of the three text columns.
grouped_df[columns_to_clean] = grouped_df[columns_to_clean].applymap(clean_text)

## Preprocess the data

In [None]:
# Use the full grouped data set (no subsampling)
df = grouped_df

# Filter out rows with missing values in necessary columns
# NOTE(review): clean_text has already converted every cell with str(), so
# missing values are presumably the text "nan" by now and this dropna likely
# removes nothing; confirm.
df = df.dropna(subset=['description', 'host_about', 'comments'])

# Concatenate the three text columns into one text per listing
text_data = df['description'] + ' ' + df['host_about'] + ' ' + df['comments']
labels = df['unlisted']

# Split the data into train and test sets (80/20, fixed seed)
text_train, text_test, labels_train, labels_test = train_test_split(text_data, labels, test_size=0.2, random_state=42)

# Fit the tokenizer on the training texts only
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text_train)

# One more than the number of known words (presumably to leave index 0 for padding)
vocab_size = len(tokenizer.word_index) + 1
# Longest training text, counted in whitespace-separated words
max_sequence_length = max([len(sentence.split()) for sentence in text_train])

# Convert the texts into sequences of word indices
sequences_train = tokenizer.texts_to_sequences(text_train)
sequences_test = tokenizer.texts_to_sequences(text_test)

## Load Glove word embedding

In [None]:
# Load pre-trained GloVe word embeddings into a word -> vector dictionary
embeddings_index = {}
with open('/content/drive/MyDrive/glove.6B/glove.6B.100d.txt', encoding='utf8') as f:
    for line in f:
        # Each line: the word followed by its vector components
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

# Create embedding matrix: row idx holds the GloVe vector of the word with
# tokenizer index idx; words without a GloVe vector keep an all-zero row.
embedding_dim = 100
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, idx in tokenizer.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[idx] = embedding_vector

In [None]:
# Number of samples embedded and passed to the model at a time.
batch_size = 100

## Fit and Evaluate the Models

In [None]:
# NOTE(review): with warm_start=True, scikit-learn only adds trees when
# n_estimators is increased between fit calls; verify how this interacts with
# repeated per-batch fitting.
rf_classifier_2_2 = RandomForestClassifier(random_state=42, n_estimators=300, min_samples_split=32, min_samples_leaf=16, max_depth=4, warm_start=True)

In [None]:
# Build and compile the embedding + dense network.
mlp_2_2 = create_mlp(input_dim=max_sequence_length, output_dim=1, embedding_dim=50, hidden_units=64)
mlp_2_2.summary()
mlp_2_2.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy', Precision(), Recall()])

In [None]:
# Build and compile the embedding + LSTM network.
lstm_2_2 = create_lstm(input_dim=max_sequence_length, output_dim=1, embedding_dim=50, hidden_units=64)
lstm_2_2.summary()
lstm_2_2.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy', Precision(), Recall()])

In [None]:

def fit_model(classifier, sequences_train, labels_train, batch_size):
    """Fit ``classifier`` on GloVe-embedded sequences, one batch at a time.

    Each batch is padded to the module-level ``max_sequence_length``, mapped
    through the module-level ``embedding_matrix`` and flattened to one feature
    row per sample before ``classifier.fit`` is called, so only one batch of
    embedded features exists in memory at any time.

    Args:
        classifier: Object exposing ``fit(X, y)``.
        sequences_train: Sequence of lists of integer word indices.
        labels_train: Labels aligned positionally with ``sequences_train``.
        batch_size: Number of samples passed to each ``fit`` call.

    Notes:
        For ensembles created with ``warm_start=True`` (e.g. scikit-learn's
        ``RandomForestClassifier``), ``fit`` only adds trees when
        ``n_estimators`` is larger than the number of trees already built.
        Calling ``fit`` repeatedly with an unchanged ``n_estimators`` would
        therefore train on the first batch only. To make every batch count,
        the configured ``n_estimators`` is treated as the total budget and is
        reached in steps, one step per batch; ``classifier.n_estimators`` is
        updated in place and ends at (already fitted trees + budget).

        Estimators that restart from scratch on each ``fit`` call (no warm
        start, no incremental training) will still only reflect the last
        batch.
    """
    total_samples = len(sequences_train)
    batch_starts = range(0, total_samples, batch_size)
    n_batches = len(batch_starts)

    # A plain array guarantees positional slicing even when the labels are a
    # pandas Series whose index was shuffled by train_test_split.
    labels_array = np.asarray(labels_train)

    grows_ensemble = bool(getattr(classifier, "warm_start", False)) and hasattr(
        classifier, "n_estimators"
    )
    if grows_ensemble:
        tree_budget = classifier.n_estimators
        already_fitted = len(getattr(classifier, "estimators_", ()))

    for batch_number, batch_start in enumerate(batch_starts, start=1):
        batch_end = min(batch_start + batch_size, total_samples)

        # Pad sequences to have the same length
        padded_train = pad_sequences(
            sequences_train[batch_start:batch_end],
            maxlen=max_sequence_length,
            padding='post',
        )
        # Look up every word index at once, then flatten to (samples, features).
        embedded_train_flat = embedding_matrix[padded_train].reshape(len(padded_train), -1)

        if grows_ensemble:
            # Ceil-divide the budget so the last batch lands exactly on it.
            target = (tree_budget * batch_number + n_batches - 1) // n_batches
            classifier.n_estimators = already_fitted + target

        classifier.fit(embedded_train_flat, labels_array[batch_start:batch_end])

In [None]:
# Train the random forest on the training sequences, batch_size samples per fit call.
fit_model(rf_classifier_2_2, sequences_train, labels_train, batch_size)

In [None]:
# Train the MLP on the training sequences, batch_size samples per fit call.
fit_model(mlp_2_2, sequences_train, labels_train, batch_size)

In [None]:
# Train the LSTM on the training sequences, batch_size samples per fit call.
fit_model(lstm_2_2, sequences_train, labels_train, batch_size)

In [None]:
def evaluate_model(classifier, sequences_test, labels_test, batch_size):
    """Predict on embedded test sequences in batches and print binary metrics.

    Each batch is padded to the module-level ``max_sequence_length``, mapped
    through the module-level ``embedding_matrix`` and flattened to one feature
    row per sample before ``classifier.predict`` is called.

    Args:
        classifier: Fitted object exposing ``predict(X)``.
        sequences_test: Sequence of lists of integer word indices.
        labels_test: True labels aligned positionally with ``sequences_test``.
        batch_size: Number of samples passed to each ``predict`` call.

    Returns:
        1-D NumPy array with one predicted label per test sample.

    Notes:
        A ``predict`` output of shape ``(n, 1)`` is treated as a column of
        positive-class probabilities and thresholded at 0.5, so the label
        metrics below receive hard 0/1 labels. This presumably matches the
        single-unit, binary-crossentropy Keras models used in this notebook;
        confirm if a model with a different output head is passed in.
    """
    total_samples = len(sequences_test)
    batch_outputs = []

    for batch_start in range(0, total_samples, batch_size):
        batch_end = min(batch_start + batch_size, total_samples)
        padded_test = pad_sequences(
            sequences_test[batch_start:batch_end],
            maxlen=max_sequence_length,
            padding='post',
        )
        # Look up every word index at once, then flatten to (samples, features).
        embedded_test_flat = embedding_matrix[padded_test].reshape(len(padded_test), -1)

        batch_predictions = np.asarray(classifier.predict(embedded_test_flat))
        if batch_predictions.ndim == 2 and batch_predictions.shape[1] == 1:
            # Column of probabilities -> flat vector of hard labels.
            batch_predictions = (batch_predictions.ravel() > 0.5).astype(int)
        batch_outputs.append(batch_predictions)

    # Concatenate once at the end instead of re-allocating on every batch.
    predictions = np.concatenate(batch_outputs) if batch_outputs else np.array([])

    accuracy = accuracy_score(labels_test, predictions)
    precision = precision_score(labels_test, predictions)
    recall = recall_score(labels_test, predictions)
    f1 = f1_score(labels_test, predictions)

    print("Accuracy:", accuracy)
    print("Precision:", precision)
    print("Recall:", recall)
    print("F1 Score:", f1)

    return predictions

In [None]:
# Score the random forest on the held-out split (prints the metrics) and keep its predictions.
rf_classifier_2_2_pred = evaluate_model(rf_classifier_2_2, sequences_test, labels_test, batch_size)

In [None]:
# Score the MLP on the held-out split (prints the metrics) and keep its predictions.
mlp_2_2_pred = evaluate_model(mlp_2_2, sequences_test, labels_test, batch_size)

In [None]:
# Score the LSTM on the held-out split (prints the metrics) and keep its predictions.
lstm_2_2_pred = evaluate_model(lstm_2_2, sequences_test, labels_test, batch_size)

In [None]:
# Confusion matrix of true labels vs. random forest predictions on the held-out split.
plot_confusion_matrix(labels_test, rf_classifier_2_2_pred)

In [None]:
# Confusion matrix of true labels vs. MLP predictions on the held-out split.
plot_confusion_matrix(labels_test, mlp_2_2_pred)

In [None]:
# Confusion matrix of true labels vs. LSTM predictions on the held-out split.
plot_confusion_matrix(labels_test, lstm_2_2_pred)

## Loading Test Dataframe and preparing for Prediction

In [None]:
# Load the unlabeled test listings and their reviews.
# NOTE(review): these absolute paths look like a Colab Google Drive mount,
# unlike the relative ./1.data_raw/ paths used at the top of the notebook.
df_test = pd.read_excel("/content/drive/MyDrive/Uni/Text Mining/test.xlsx")
df_test_reviews = pd.read_excel("/content/drive/MyDrive/Uni/Text Mining/test_reviews.xlsx")

In [None]:
# Left-join the reviews onto the listings by "index": a listing is repeated once
# per matching review row, and listings without any review keep NaN in the
# review columns.
test_df = pd.merge(df_test,df_test_reviews, on="index", how="left")

In [None]:
    # Collapse the merged frame back to one row per listing: keep the first
    # description / host_about and join all of the listing's comments with spaces.
    # NOTE(review): astype(str) turns a missing comment (NaN) into the literal
    # string 'nan', so rows with no comment end up with the text "nan".
    test_df = test_df.groupby('index').agg({
        'description': 'first',
        'host_about': 'first',
        'comments': lambda x: ' '.join(x.astype(str).values),
    }).reset_index()

## Prediction

In [None]:
def predict_row(classifier, row):
    """Return ``classifier``'s prediction for a single data-frame row.

    The row's ``description``, ``host_about`` and ``comments`` fields are
    passed through ``clean_text``, joined with spaces, converted to word
    indices with the fitted ``tokenizer``, padded to ``max_sequence_length``
    and mapped through ``embedding_matrix`` before prediction.

    Args:
        classifier: Fitted object exposing ``predict(X)``.
        row: Row holding the three text columns.

    Returns:
        The first (and only) element of ``classifier.predict`` for this row.
    """
    text_columns = ["description", "host_about", "comments"]
    cleaned = row[text_columns].apply(clean_text)

    # One document per row, fields separated by a single space.
    document = ' '.join(cleaned[column] for column in text_columns)

    # Word indices, post-padded to the length used for the training data.
    token_ids = tokenizer.texts_to_sequences([document])
    padded = pad_sequences(token_ids, maxlen=max_sequence_length, padding='post')

    # Embed every index at once and flatten to a single feature row.
    features = embedding_matrix[padded].reshape(padded.shape[0], -1)

    return classifier.predict(features)[0]

In [None]:
# Run predict_row with the random forest on every row and store the result in a new column.
test_df['predictions'] = test_df.apply(lambda row: predict_row(rf_classifier_2_2, row), axis=1)

In [None]:
# Count how many rows received each predicted value.
test_df['predictions'].value_counts()

# Third Approach - Transformer-based Embedding