In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.svm import LinearSVR
from sklearn.ensemble import RandomForestRegressor

# Notebook-wide display settings: show every column, and up to 100 rows,
# when a DataFrame is rendered.
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", 100)

In [None]:

# Load the cleaned Florence listings (path is relative to the working directory).
dtFlorence=pd.read_csv("datasetsClean/florence_cleaned.csv")
# latitude/longitude are kept in this frame; getDtCaratterisitcheStrutturali
# removes them when it builds the structural view.
dtFlorence.columns

## VARIE FUNZIONI DI PULIZIA DEL DATASET

In [None]:
# Remove the non-structural columns and the ones that get in the way of clustering.
def getDtCaratterisitcheStrutturali(dataset):
    """Return a new frame holding only the structural features of ``dataset``.

    The CSV index column, amenities, stay limits, review statistics, license,
    coordinates and price are removed. ``dataset`` itself is left untouched.

    Raises:
        KeyError: if any of the listed columns is missing from ``dataset``.
    """
    stay_limit_columns = [
        'minimum_nights', 'maximum_nights',
        'minimum_minimum_nights', 'maximum_minimum_nights',
        'minimum_maximum_nights', 'maximum_maximum_nights',
        'minimum_nights_avg_ntm', 'maximum_nights_avg_ntm',
    ]
    review_columns = [
        'number_of_reviews', 'number_of_reviews_ltm', 'number_of_reviews_l30d',
        'review_scores_rating', 'review_scores_accuracy',
        'review_scores_cleanliness', 'review_scores_checkin',
        'review_scores_communication', 'review_scores_location',
        'review_scores_value',
    ]
    other_columns = ['Unnamed: 0', 'amenities', 'license', 'latitude', 'longitude', 'price']

    to_remove = other_columns + stay_limit_columns + review_columns
    return dataset.drop(columns=to_remove)

def oneHotEncoding_neighbourhood_cleansed(df):
    """Replace ``neighbourhood_cleansed`` with one indicator column per category.

    Args:
        df: DataFrame containing a ``neighbourhood_cleansed`` column.

    Returns:
        A new DataFrame without ``neighbourhood_cleansed`` and with the
        one-hot columns appended on the right. ``df`` is not modified and its
        index is preserved.
    """
    # scikit-learn 1.2 renamed `sparse` to `sparse_output` and later dropped the
    # old keyword; try the new spelling first and fall back for older releases.
    try:
        encoder = OneHotEncoder(sparse_output=False)
    except TypeError:
        encoder = OneHotEncoder(sparse=False)
    encoded = encoder.fit_transform(df[['neighbourhood_cleansed']])

    # Reuse df's index: pd.concat(axis=1) aligns on index labels, so a default
    # RangeIndex here would misalign rows (and add NaN rows) whenever df is
    # not indexed 0..n-1.
    encoded_df = pd.DataFrame(
        encoded,
        columns=encoder.get_feature_names_out(['neighbourhood_cleansed']),
        index=df.index,
    )

    return pd.concat([df.drop(columns=['neighbourhood_cleansed']), encoded_df], axis=1)

def createLabel(df, origin_df=None):
    """Tag identical rows with a shared integer ``label`` and add a positional ``id``.

    Two rows receive the same label when the string form of every column
    matches (so two missing values compare equal). Labels are numbered from 0
    in order of first appearance.

    Args:
        df: Feature frame. It is modified in place: ``label`` and ``id``
            columns are added (or overwritten).
        origin_df: Optional frame that must receive the same positional ``id``
            so it can later be joined back on ``id``. When omitted, the
            module-level ``dtFlorence`` is used if it exists, which is what
            this function always did before the parameter was added.

    Returns:
        The same ``df`` object, for convenience.
    """
    label_ids = {}
    labels = []
    for values in df.itertuples(index=False, name=None):
        # A tuple of strings (rather than one '_'-joined string) keeps rows such
        # as ('x_y', 'z') and ('x', 'y_z') apart.
        key = tuple(str(value) for value in values)
        labels.append(label_ids.setdefault(key, len(label_ids)))

    df['label'] = labels
    df['id'] = range(len(df))

    if origin_df is None:
        # Legacy behaviour: fall back to the notebook-level frame if present.
        origin_df = globals().get('dtFlorence')
    if origin_df is not None:
        origin_df['id'] = range(len(origin_df))
    return df

def mergeColumns(df, origin_df):
    """Attach the non-structural columns of ``origin_df`` to ``df``.

    Both frames must carry an ``id`` column, which is used as the join key.
    The join is a left join, so every row of ``df`` is kept.

    Returns:
        A new DataFrame: the columns of ``df`` followed by amenities, stay
        limits, review statistics, license and price taken from ``origin_df``.
    """
    stay_limit_columns = [
        'minimum_nights', 'maximum_nights',
        'minimum_minimum_nights', 'maximum_minimum_nights',
        'minimum_maximum_nights', 'maximum_maximum_nights',
        'minimum_nights_avg_ntm', 'maximum_nights_avg_ntm',
    ]
    review_columns = [
        'number_of_reviews', 'number_of_reviews_ltm', 'number_of_reviews_l30d',
        'review_scores_rating', 'review_scores_accuracy',
        'review_scores_cleanliness', 'review_scores_checkin',
        'review_scores_communication', 'review_scores_location',
        'review_scores_value',
    ]
    # 'id' comes first because it is the key shared by the two frames.
    wanted = ['id', 'amenities'] + stay_limit_columns + review_columns + ['license', 'price']

    extra = origin_df[wanted]
    return pd.merge(df, extra, on='id', how='left')

def getCaseTempo0(df):
    """Return the rows that hold the lowest ``number_of_reviews`` of their label.

    Ties are kept: every row whose review count equals the minimum of its
    ``label`` group is returned, with its original index.
    """
    # Broadcast each group's minimum back onto the rows of that group.
    lowest_per_label = df.groupby('label')['number_of_reviews'].transform('min')
    is_time_zero = df['number_of_reviews'] == lowest_per_label
    return df[is_time_zero]

# Original note (translated from Italian): modify for the other two datasets.
def createAnalysisDf(carattStrutt, dfMerged):
    """Build the empty analysis frame that will hold time-0 / time-1 pairs.

    Every column of ``dfMerged`` listed in ``carattStrutt`` is kept under its
    own name; every other column is split into a ``<name>_0`` and a
    ``<name>_1`` column.

    Returns:
        An empty DataFrame with those columns, in the order of ``dfMerged``.
    """
    header = []
    for name in dfMerged:
        if name in carattStrutt:
            header.append(name)
        else:
            header.extend([name + '_0', name + '_1'])
    return pd.DataFrame(columns=header)

def createArrayCarattStrutt(dfStrutturale):
    """Return the column names of ``dfStrutturale`` as a list, without ``'id'``.

    Raises:
        ValueError: if ``dfStrutturale`` has no ``'id'`` column.
    """
    names = [*dfStrutturale.columns]
    # 'id' is a bookkeeping key, not a structural feature.
    names.remove('id')
    return names

def createArrayCarattNonStrutt(df):
    """Split the column names of ``df`` by their time suffix.

    Returns:
        A tuple ``(names ending in '_0', names ending in '_1')``, each in
        column order.
    """
    at_time_0 = []
    at_time_1 = []
    for name in df.columns:
        if name.endswith('_0'):
            at_time_0.append(name)
        if name.endswith('_1'):
            at_time_1.append(name)
    return (at_time_0, at_time_1)

def populateAnalysisDf(df_t0, df_labeled, carattStrutt, carattNonStrutt_0, carattNonStrutt_1, analysisDf):
    """Pair every time-0 listing with the later listings that share its label.

    For each row of ``df_t0`` the rows of ``df_labeled`` with the same
    ``label`` are scanned; a pair is produced for every candidate whose ``id``
    and ``number_of_reviews`` both differ from the time-0 row.

    Args:
        df_t0: Rows taken as the time-0 observations.
        df_labeled: All labelled rows (needs ``label``, ``id``,
            ``number_of_reviews`` and the un-suffixed source columns).
        carattStrutt: Names of the structural columns (copied from the t0 row).
        carattNonStrutt_0: ``*_0`` columns, filled from the t0 row.
        carattNonStrutt_1: ``*_1`` columns, filled from the paired row.
        analysisDf: Frame whose columns define the output layout; its existing
            rows, if any, are kept in front of the new ones.

    Returns:
        A new DataFrame with the columns of ``analysisDf`` and a fresh
        0..n-1 index. ``analysisDf`` itself is not modified.
    """
    structural = set(carattStrutt)
    at_time_0 = set(carattNonStrutt_0)
    at_time_1 = set(carattNonStrutt_1)

    # Decide once, per output column, which row feeds it and under which name.
    # The precedence (structural, then _0, then _1) matters for ambiguous names;
    # columns in none of the three groups are left unset (NaN).
    column_plan = []
    for column in analysisDf.columns:
        if column in structural:
            column_plan.append((column, 't0', column))
        elif column in at_time_0:
            column_plan.append((column, 't0', column[:-2]))  # strip the '_0' suffix
        elif column in at_time_1:
            column_plan.append((column, 't1', column[:-2]))  # strip the '_1' suffix

    # Split df_labeled by label once instead of re-filtering it for every t0 row.
    rows_by_label = {label: group for label, group in df_labeled.groupby('label', sort=False)}

    rows_to_add = []
    for _, row in df_t0.iterrows():
        candidates = rows_by_label.get(row['label'])
        if candidates is None:
            continue
        min_reviews = row['number_of_reviews']
        id_value = row['id']

        for _, row_filtered in candidates.iterrows():
            if row_filtered['id'] == id_value or row_filtered['number_of_reviews'] == min_reviews:
                continue
            sources = {'t0': row, 't1': row_filtered}
            rows_to_add.append({column: sources[origin][name] for column, origin, name in column_plan})

    # DataFrame.append no longer exists in pandas 2.x; build the new rows in one
    # go and concatenate only when there is something on both sides.
    if not rows_to_add:
        return analysisDf.reset_index(drop=True)
    new_rows = pd.DataFrame(rows_to_add, columns=analysisDf.columns)
    if analysisDf.empty:
        return new_rows
    return pd.concat([analysisDf, new_rows], ignore_index=True)

# FUNZIONE DI POPOLAMENTO DATASET ALTERNATIVA CHE BILANCIA LE CASE AL T_0 E AL T_1 
def populateAnalysisDf2(df_labeled):
    """Build a balanced table of (time-0, time-1) listing pairs, label by label.

    Within each label the rows holding the minimum ``number_of_reviews`` are
    the time-0 side and all other rows are the time-1 side. The longer side is
    truncated so both have the same length, then the two sides are matched
    positionally through a helper ``index`` column.

    Args:
        df_labeled: Frame with at least ``label`` and ``number_of_reviews``.

    Returns:
        One DataFrame with the time-0 columns suffixed ``_x``, the helper
        ``index`` column, and the time-1 columns suffixed ``_y``. It is empty
        (but has those columns) when no label yields a pair.
    """
    paired_frames = []
    # Iterating over the groups visits every label, including the highest one,
    # which range(0, max(label)) used to skip.
    for _, group in df_labeled.groupby('label', sort=True):
        is_time_0 = group['number_of_reviews'] == group['number_of_reviews'].min()
        n_pairs = int(min(is_time_0.sum(), (~is_time_0).sum()))
        if n_pairs == 0:
            continue

        # Copy before adding the helper column so the slices of df_labeled are
        # never written to.
        t_0 = group[is_time_0].head(n_pairs).copy()
        t_1 = group[~is_time_0].head(n_pairs).copy()
        t_0['index'] = range(n_pairs)
        t_1['index'] = range(n_pairs)
        paired_frames.append(pd.merge(t_0, t_1, how='outer', on='index'))

    if not paired_frames:
        # Nothing to pair: return an empty frame with the usual column layout
        # instead of letting pd.concat fail on an empty list.
        empty_side = df_labeled.head(0).assign(index=pd.Series(dtype='int64'))
        return pd.merge(empty_side, empty_side, how='outer', on='index')

    return pd.concat(paired_frames, ignore_index=True)
    


In [None]:
# Structural view of the listings, with the neighbourhood one-hot encoded.
dtFlorenceStrutturale = (
    dtFlorence
    .pipe(getDtCaratterisitcheStrutturali)
    .pipe(oneHotEncoding_neighbourhood_cleansed)
)
# Column names captured before 'label' and 'id' are added by createLabel.
caratteristicheStrutturali = dtFlorenceStrutturale.columns.to_list()

# Label structurally identical listings, then bring the non-structural
# columns back from the original frame.
dtFlorenceStrutturale = createLabel(dtFlorenceStrutturale)
dtFlorenceLabeled = mergeColumns(dtFlorenceStrutturale, dtFlorence)

In [None]:
# First pairing strategy: every time-0 listing (minimum review count within
# its label) is paired with each other listing of the same label that has a
# different review count.
# NOTE(review): the cell that calls populateAnalysisDf2 further down rebinds
# dtFlorenceAnalysis, so on a top-to-bottom run the result built here is
# discarded -- confirm which strategy is meant to feed the model.
rows_with_min_reviews = getCaseTempo0(dtFlorenceLabeled)
carattStrutt = createArrayCarattStrutt(dtFlorenceStrutturale)
dtFlorenceAnalysis = createAnalysisDf(carattStrutt, dtFlorenceLabeled)
carattNonStrutt_0, carattNonStrutt_1 = createArrayCarattNonStrutt(dtFlorenceAnalysis)
dtFlorenceAnalysis = populateAnalysisDf(rows_with_min_reviews, dtFlorenceLabeled, carattStrutt, carattNonStrutt_0, carattNonStrutt_1, dtFlorenceAnalysis)


In [None]:
# Alternative, balanced pairing. This rebinds dtFlorenceAnalysis; the columns
# produced here carry the _x / _y merge suffixes rather than _0 / _1.
dtFlorenceAnalysis = populateAnalysisDf2(dtFlorenceLabeled)

In [None]:
# Persist the analysis table. The row index is written as well (to_csv
# default), so reading the file back without index_col yields an extra
# leading 'Unnamed: 0' column.
dtFlorenceAnalysis.to_csv("./datasetsAnalysis/florence_analysis.csv")

In [None]:
def contaValoriUnici(dataset,colonna):
    """Print how many times each distinct value of ``dataset[colonna]`` occurs.

    Values are listed in order of first appearance. Missing values are counted
    through ``isna()``: an equality test never matches NaN, so they would
    otherwise always be reported with a count of 0.
    """
    column = dataset[colonna]
    for value in column.unique():
        if pd.isna(value):
            occurrences = column.isna().sum()
        else:
            occurrences = (column == value).sum()
        print("Valore ",value,"counts: ",occurrences)

### VISUALIZZAZIONE DELLA DISTRIBUZIONE

In [None]:
def showHistogram(dataset, column_name, bins=300):
    """Plot the distribution of ``dataset[column_name]`` as a histogram.

    Args:
        dataset: DataFrame holding the data.
        column_name: Name of the column to plot.
        bins: Number of histogram bins (300 by default).
    """
    fig, ax = plt.subplots(figsize=(10, 5))
    ax.hist(dataset[column_name], bins=bins, color='skyblue', edgecolor='black')
    ax.set_title('Histogram')
    ax.set_xlabel('Values')
    ax.set_ylabel('Frequency')
    # The automatic ticks are kept on purpose: overwriting the tick labels with
    # the column's unique values, without also fixing the tick positions,
    # labels the axis with values unrelated to where the ticks sit.
    ax.grid(True)
    plt.show()

In [None]:
# Reload the saved analysis table so the modelling part can start from disk.
dtFlorenceAnalysis = pd.read_csv("./datasetsAnalysis/florence_analysis.csv")

In [None]:
# Keep only the rows whose number_of_reviews_x is 0.
# NOTE(review): the _x suffix matches the output of populateAnalysisDf2 --
# confirm the CSV on disk was produced by that function.
dtFlorenceAnalysis = dtFlorenceAnalysis.loc[dtFlorenceAnalysis["number_of_reviews_x"]==0]


In [None]:
# NOTE(review): these names use the _0 / _1 suffixes and a bare 'label', while
# the filter in the previous cell uses number_of_reviews_x (the _x / _y layout).
# Confirm which layout the CSV on disk really has; a missing name raises KeyError.
featureDelete = ['neighbourhood_group_cleansed',"neighbourhood", "amenities_0","amenities_1","id_0","id_1","label"]

# Rebind instead of dropping in place: the frame comes from a .loc filter, and
# mutating such a frame in place triggers SettingWithCopyWarning.
dtFlorenceAnalysis = dtFlorenceAnalysis.drop(columns=featureDelete)

# Positional drop of the first 13 remaining columns.
# NOTE(review): this depends on the exact column order of the CSV -- consider
# naming the columns explicitly.
dtFlorenceAnalysis = dtFlorenceAnalysis.drop(columns=dtFlorenceAnalysis.columns[0:13])

# MODELLO DI APPRENDIMENTO

In [None]:
# Features: the first 37 columns of the analysis table; target: price_1.
# NOTE(review): the cut-off 37 is positional -- verify that price_1 is not
# among these columns, otherwise the target leaks into the features.
X = dtFlorenceAnalysis.iloc[:,0:37]
y = dtFlorenceAnalysis[["price_1"]]

In [None]:
# Remove price_0 from the features. Rebinding (rather than inplace=True) avoids
# mutating a slice of dtFlorenceAnalysis and the SettingWithCopyWarning it raises.
X = X.drop(columns=["price_0"])

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)

# Learn the scaling statistics on the training split only and reuse them on the
# test split. Re-fitting on the test data would standardise train and test with
# different means/variances and leak test information into the evaluation.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# The target gets its own scaler (same fit-on-train rule) and is flattened to
# 1-D, which is the shape the regressors expect for a single target.
# Metrics computed downstream are therefore in standardised price units.
y_scaler = StandardScaler()
y_train = y_scaler.fit_transform(y_train).ravel()
y_test = y_scaler.transform(y_test).ravel()


## LINEAR SVR

In [None]:

# Linear support-vector regressor, trained on the scaled training split.
regr = LinearSVR(
    loss='squared_epsilon_insensitive',
    dual=True,
    tol=1e-2,
    random_state=42,
)
regr.fit(X_train, y_train)

# Score the model on the held-out split.
y_pred = regr.predict(X_test)
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

print(f'Mean Squared Error: {mse}')
print(f'R-squared: {r2}')

## RANDOMFOREST

In [None]:
# Random forest regressor with 1000 trees and a fixed seed.
regressor = RandomForestRegressor(
    n_estimators=1000,
    random_state=42,
)

# Train the model on the training split.
regressor.fit(X_train, y_train)

# Predict on the held-out split and evaluate the predictions.
y_pred = regressor.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error:", mse)
print(f'R-squared: {r2}')

In [None]:
# Coefficients of the fitted LinearSVR model.
regr.coef_

In [None]:
# Feature importances of the fitted random forest.
regressor.feature_importances_

# PLOT DEI RISULTATI OTTENUTI 

In [None]:
coloumnNumber = X.shape[1]
# Take the labels from X itself. X no longer contains price_0, so the first N
# columns of dtFlorenceAnalysis are not the columns the model was trained on
# and would put the wrong name under the bars.
columnNames = X.columns.to_list()

plt.figure(figsize=(15.75,11.81))
plt.bar(range(coloumnNumber), regr.coef_)
# One call fixes tick positions, labels and rotation together.
plt.xticks(range(coloumnNumber), columnNames, rotation=90)
plt.grid(True)
plt.show()