In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input tree and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the Titanic competition splits: the labelled training set and the unlabelled test set.
test_data = pd.read_csv("/kaggle/input/titanic/test.csv")
train_data = pd.read_csv("/kaggle/input/titanic/train.csv")

# Quick look at the three columns most related to cabin/class information.
preview_columns = ['Pclass', 'Cabin', 'Ticket']
print(train_data.loc[:, preview_columns])

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from sklearn.impute import KNNImputer
# Shared KNN imputer (5 neighbours); reused further down to fill missing Fare and Age values.
imputer = KNNImputer(n_neighbors=5)

# Number of passengers for every (port, survival outcome) pair.
embarked_survival_counts = train_data.groupby(['Embarked', 'Survived']).size().reset_index(name='Count')

# Total passengers per port: the denominator for the percentages below.
embarked_total_counts = train_data.groupby('Embarked')['PassengerId'].count().reset_index(name='Total')
embarked_survival_counts = pd.merge(embarked_survival_counts, embarked_total_counts, on='Embarked')
embarked_survival_counts['Percentage'] = (embarked_survival_counts['Count'] / embarked_survival_counts['Total']) * 100

# Use one explicit port order for both panels; otherwise the count plot follows order of
# appearance in the data while the percentage plot follows the (sorted) groupby order.
port_order = sorted(train_data['Embarked'].dropna().unique())

# Two panels on one row: absolute counts (left) and within-port percentages (right).
fig, axes = plt.subplots(1, 2, figsize=(18, 6))

sns.countplot(data=train_data, x='Embarked', hue='Survived', palette='Set1',
              order=port_order, ax=axes[0])
axes[0].set_title('Survival Count Based on Embarked Location')
axes[0].set_xlabel('Port of Embarkation')
axes[0].set_ylabel('Number of Passengers')

# Pass the target axes explicitly instead of relying on pyplot's "current axes" state.
sns.barplot(data=embarked_survival_counts, x='Embarked', y='Percentage', hue='Survived',
            palette='Set1', order=port_order, ax=axes[1])
axes[1].set_title('Survival Percentage Based on Embarked Location')
axes[1].set_xlabel('Port of Embarkation')
axes[1].set_ylabel('Percentage of Passengers')

# Avoid overlapping labels between the two panels, then render.
plt.tight_layout()
plt.show()


# Three side-by-side views of the Fare variable on a single row.
fig, axes = plt.subplots(1, 3, figsize=(18, 6))
hist_ax, box_ax, split_ax = axes

# Histogram of the raw fares.
hist_ax.hist(train_data['Fare'], bins=50, color='blue', edgecolor='black')
hist_ax.set(title='Distribuzione della variabile Fare', xlabel='Fare', ylabel='Frequenza')

# Box plot of the fares over all passengers.
sns.boxplot(x=train_data['Fare'], color='lightblue', ax=box_ax)
box_ax.set(title='Boxplot della variabile Fare', xlabel='Fare')

# Box plot of the fares split by survival outcome.
sns.boxplot(x='Survived', y='Fare', data=train_data, ax=split_ax)
split_ax.set(title='Distribuzione di Fare per Survived', xlabel='Survived', ylabel='Fare')

# Tidy the spacing between panels and render the figure.
plt.tight_layout()
plt.show()

import seaborn as sns
import matplotlib.pyplot as plt

# Bucket Fare into quartiles so the interaction with Pclass is easier to read.
train_data['Fare_bin'] = pd.qcut(train_data['Fare'], q=4)

# Mean survival rate for each (Pclass, Fare quartile) combination.
pivot_table = pd.pivot_table(
    train_data,
    values='Survived',
    index='Pclass',
    columns='Fare_bin',
    aggfunc='mean',
    observed=False,
)

# Left: heatmap of the survival rates. Right: raw Fare vs. Pclass coloured by outcome.
fig, axes = plt.subplots(1, 2, figsize=(18, 6))
heat_ax, scatter_ax = axes

sns.heatmap(pivot_table, annot=True, cmap='YlGnBu', ax=heat_ax)
heat_ax.set_title("Heatmap delle interazioni tra Pclass e Fare rispetto alla sopravvivenza")

sns.scatterplot(data=train_data, x='Pclass', y='Fare', hue='Survived', ax=scatter_ax)
scatter_ax.set_title('Scatter plot Pclass and Fare for Survived label')

plt.show()

# Fill the missing test fares with an imputer fitted on the TRAINING fares only, so no
# statistic is learned from the test set. `.ravel()` turns the (n, 1) result into a 1-D column.
imputer.fit(train_data[['Fare']])
test_data['Fare'] = imputer.transform(test_data[['Fare']]).ravel()

# Engineered feature: fare divided by passenger class (1, 2 or 3).
train_data['Pclass_Fare'] = train_data['Fare'] / train_data['Pclass']
test_data['Pclass_Fare'] = test_data['Fare'] / test_data['Pclass']

# Histogram and box plot of the engineered feature. The labels now state the actual
# formula (Fare / Pclass); the previous ones read "Pclass*Fare" and "Pclass/Fare".
fig, axes = plt.subplots(1, 2, figsize=(18, 6))
axes[0].hist(train_data['Pclass_Fare'], bins=30, color='skyblue', edgecolor='black')
axes[0].set_title('Histogram of Fare / Pclass')
axes[0].set_xlabel('Fare / Pclass')
axes[0].set_ylabel('Frequency')

axes[1].boxplot(train_data['Pclass_Fare'], vert=False)
axes[1].set_title('Boxplot of Fare / Pclass')
axes[1].set_xlabel('Fare / Pclass')

plt.tight_layout()
plt.show()

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

def _ticket_category(ticket):
    """Map a raw ticket value to a coarse category.

    Returns 'Missing' for null values, the first character when that character is a
    letter, and 'NoLetter' otherwise (including for an empty string, which previously
    raised IndexError).
    """
    if pd.isnull(ticket):
        return 'Missing'
    text = str(ticket)
    return text[0] if text[:1].isalpha() else 'NoLetter'


# Same categorisation for both splits, through one shared helper.
train_data['TicketCategory'] = train_data['Ticket'].apply(_ticket_category)
test_data['TicketCategory'] = test_data['Ticket'].apply(_ticket_category)

# Survival rate (in percent) per ticket category, sorted for readability.
ticket_survival = train_data.groupby('TicketCategory')['Survived'].mean() * 100
ticket_survival = ticket_survival.sort_values()

# Left panel: survival rate per category.
fig, axes = plt.subplots(1, 2, figsize=(18, 6))
sns.barplot(x=ticket_survival.index, y=ticket_survival.values, palette="viridis", ax=axes[0])
axes[0].set_xlabel("Ticket Category")
axes[0].set_ylabel("Survival Probability (%)")
axes[0].set_title("Survival Probability by Ticket Category")

# Frequency table: ticket category (rows) by passenger class (columns).
ticket_class_counts = train_data.groupby(['TicketCategory', 'Pclass']).size().unstack(fill_value=0)

# Right panel: stacked class composition of every ticket category.
ticket_class_counts.plot(kind="bar", stacked=True, colormap="viridis", width=0.8, ax=axes[1])
axes[1].set_xlabel("Ticket Category")
axes[1].set_ylabel("Number of Passengers")
axes[1].set_title("Ticket Category Distribution by Class")
axes[1].legend(title="Class", labels=["1st", "2nd", "3rd"])
# Rotate the tick labels on the intended axes explicitly rather than via pyplot state.
axes[1].tick_params(axis='x', rotation=45)
plt.show()


In [None]:
# Learn the Age imputation from the training set only and reuse it for the test set;
# refitting on the test set would impute each split with different statistics and
# leak test information into preprocessing. `.ravel()` flattens the (n, 1) output.
train_data['Age'] = imputer.fit_transform(train_data[['Age']]).ravel()
test_data['Age'] = imputer.transform(test_data[['Age']]).ravel()


# Numeric encoding of the "Sex" column.
SEX_CODES = {'male': 0, 'female': 1}
# Raw string: the pattern contains `\.`, which is an invalid escape in a normal string literal.
TITLE_PATTERN = r' ([A-Za-z]+)\.'
# Infrequent honorifics collapsed into a single 'Rare' bucket.
RARE_TITLES = ['Lady', 'Countess', 'Capt', 'Col',
               'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona']
# Spelling variants folded into their common equivalent.
TITLE_SYNONYMS = {'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'}


def _encode_sex_and_title(frame):
    """Encode Sex as 0/1, derive a cleaned Title from Name, and one-hot encode it.

    `frame` is modified in place (Sex, Title); the returned DataFrame is the result of
    `pd.get_dummies`, i.e. `frame` with Title replaced by its dummy columns
    (first category dropped).
    """
    frame['Sex'] = frame['Sex'].map(SEX_CODES)
    titles = frame['Name'].str.extract(TITLE_PATTERN, expand=False)
    frame['Title'] = titles.replace(RARE_TITLES, 'Rare').replace(TITLE_SYNONYMS)
    return pd.get_dummies(frame, columns=['Title'], drop_first=True)


# Apply the identical transformation to both splits.
train_data = _encode_sex_and_title(train_data)
test_data = _encode_sex_and_title(test_data)

# FamilySize = siblings/spouses + parents/children + the passenger themselves.
# IsAlone = 1 when the passenger travels without family, else 0.
# Each frame builds IsAlone from ITS OWN FamilySize: the previous code indexed test_data
# with masks computed on train_data, so test rows received flags of unrelated passengers.
for frame in (train_data, test_data):
    frame['FamilySize'] = frame['SibSp'] + frame['Parch'] + 1
    frame['IsAlone'] = (frame['FamilySize'] == 1).astype(int)

#Creazione di AgeGroup
train_data['AgeGroup'] = pd.cut(train_data['Age'], bins=[0.0, 1.0, 4.0, 11.0, 17.0, 24.0, 64.0, 100.0],
                 labels=['Infants', 'Toddlers', 'Children', 'Adolescents', 
                         'Young Adults', 'Adults', 'Seniors'])
test_data['AgeGroup'] = pd.cut(test_data['Age'], bins=[0.0, 1.0, 4.0, 11.0, 17.0, 24.0, 64.0, 100.0],
                 labels=['Infants', 'Toddlers', 'Children', 'Adolescents', 
                         'Young Adults', 'Adults', 'Seniors'])


# Calcolo della percentuale di sopravvivenza per gruppo di età
age_survival_df = train_data.groupby('AgeGroup', observed=False)['Survived'].mean().reset_index()
age_survival_df['Survived'] = age_survival_df['Survived'] * 100  # Converti in percentuale

# Grafico
plt.figure(figsize=(10, 6))
sns.barplot(data=age_survival_df, x='AgeGroup', y='Survived', palette="viridis")
plt.title("Probabilità di Sopravvivenza per Gruppo di Età")
plt.xlabel("Gruppo di Età")
plt.ylabel("Percentuale di Sopravvivenza")
plt.show()

# One-hot encode the remaining categorical columns, one at a time and in this order,
# dropping the first level of each to avoid a redundant column.
for categorical_column in ('AgeGroup', 'Embarked', 'TicketCategory'):
    train_data = pd.get_dummies(train_data, columns=[categorical_column], drop_first=True)
    test_data = pd.get_dummies(test_data, columns=[categorical_column], drop_first=True)

# Disabled experiment, kept as an inert string literal (it is never executed):
# binning Fare into four labelled ranges and one-hot encoding FareBin and Pclass.
'''
#Dividere la variabile Fare in Bin e Creazione variabili Dummies
train_data['FareBin'] = pd.cut(train_data['Fare'], bins=[0, 7.91, 14.454, 31, 512], 
                               labels=['Low', 'Medium', 'High', 'Very High'])
test_data['FareBin'] = pd.cut(test_data['Fare'], bins=[0, 7.91, 14.454, 31, 512], 
                               labels=['Low', 'Medium', 'High', 'Very High'])
train_data = pd.get_dummies(train_data, columns=['FareBin'], drop_first=True)
test_data = pd.get_dummies(test_data, columns=['FareBin'], drop_first=True)

#GetDummies per la variabile Pclass per aiutare alcuni modelli
train_data = pd.get_dummies(train_data, columns=['Pclass'], drop_first=True)
test_data = pd.get_dummies(test_data, columns=['Pclass'], drop_first=True)
'''

# Sanity check: print the first rows of the training frame after feature engineering.
print(train_data.head())

import seaborn as sns
import matplotlib.pyplot as plt

# Three panels relating Fare to passenger class and family size.
fig, axes = plt.subplots(1, 3, figsize=(18, 6))
box_ax, corr_ax, kde_ax = axes

# Fare per class, with one box per family size.
sns.boxplot(data=train_data, x='Pclass', y='Fare', hue='FamilySize', ax=box_ax)
box_ax.set_title("Fare per Classe e Dimensione della Famiglia")

# Pairwise correlation between Fare, FamilySize and Pclass.
corr_columns = ['Fare', 'FamilySize', 'Pclass']
corr_matrix = train_data[corr_columns].corr()
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1, ax=corr_ax)
corr_ax.set_title("Correlazione tra Fare, Classe e FamilySize")

# Stacked density of Fare for each class.
sns.kdeplot(data=train_data, x='Fare', hue='Pclass', multiple='stack', ax=kde_ax)
kde_ax.set_title("Distribuzione del Fare per Classe")

plt.show()


In [None]:
from sklearn.model_selection import train_test_split
# Raw columns that have been replaced by engineered features (or are not used by the models).
raw_columns = ['Name', 'Cabin', 'Ticket', 'SibSp', 'Parch', 'Age', 'Pclass', 'Fare']
# 'Fare_bin' only exists in the training frame (it was created for the EDA heatmap).
train_data = train_data.drop(raw_columns + ['Fare_bin'], axis=1)
test_data = test_data.drop(raw_columns, axis=1)

# Feature matrix and target for the labelled data.
X = train_data.drop(columns=['Survived', 'PassengerId'])
y = train_data['Survived']

# Feature matrix for the Kaggle test set. The dummy columns were generated separately for
# each split, so align them with the training columns: a category absent from the test
# set becomes an all-zero column, and a category unseen in training is dropped.
X_val = test_data.drop(columns=['PassengerId']).reindex(columns=X.columns, fill_value=0)

# Stratified 80/20 hold-out split of the labelled data.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
import xgboost as xgb
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
import numpy as np
# Aliases of the current split; the feature-selection cells below work on these names.
X_train_dta, X_test_dta, X_val_dta = X_train, X_test, X_val
X_train_dta.head()

# Base estimators.
clf1 = LogisticRegression(penalty='l2', C=0.1, solver='saga', random_state=1, class_weight='balanced')
clf2 = DecisionTreeClassifier(max_depth=5, criterion='entropy', random_state=0, class_weight='balanced')
clf3 = KNeighborsClassifier(n_neighbors=7, p=2, metric='minkowski')
clf4 = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=1)
clf5 = xgb.XGBClassifier(n_estimators=1500, learning_rate=0.08, max_depth=5, eval_metric='logloss', random_state=42)

# Every model except the decision tree is preceded by standardisation.
pipe1 = Pipeline(steps=[('sc', StandardScaler()), ('clf1', clf1)])
pipe2 = Pipeline(steps=[('clf2', clf2)])
pipe3 = Pipeline(steps=[('sc', StandardScaler()), ('clf3', clf3)])
pipe4 = Pipeline(steps=[('sc', StandardScaler()), ('clf4', clf4)])
pipe5 = Pipeline(steps=[('sc', StandardScaler()), ('clf5', clf5)])

# Display names, in the same order as the pipelines.
clf_labels = ['Logistic regression', 'Decision Tree', 'KNN', 'Random Forest', 'XGBoost']
print('10-fold cross validation:\n')
pipes = [pipe1, pipe2, pipe3, pipe4, pipe5]

# Festure importance per RandomForestClassifier e XGboost

# 1. Addestramento del modello
from sklearn.base import clone
from sklearn.inspection import permutation_importance
import matplotlib.pyplot as plt

# Plain ndarray copy of the training features (no column names); kept because later
# cells may still refer to it.
X_train_dta_np = X_train_dta.values

# 1. Fit every pipeline on the same named-column DataFrame.
classifiers = {
    'Logistic Regression': pipe1,
    'Decision Tree': pipe2,
    'KNN': pipe3,
    'Random Forest': pipe4,
    'XGBoost': pipe5,
}
for fitted_pipe in classifiers.values():
    fitted_pipe.fit(X_train_dta, y_train)

# 2. Impurity-based importances of the random forest that was fitted inside pipe4.
importances = pipe4.named_steps['clf4'].feature_importances_
feat_importances = pd.Series(importances, index=X_train_dta.columns).sort_values(ascending=False)

fig, ax = plt.subplots(figsize=(10, 6))
feat_importances.plot(kind='bar', ax=ax)
ax.set_title('Feature Importance - Random Forest')
plt.show()

# 3. XGBoost built-in importances. A *clone* is fitted on the DataFrame so the plot shows
#    real feature names; refitting clf5 itself would leave pipe5 holding a scaler paired
#    with a booster trained on unscaled data.
xgb_for_plot = clone(clf5).fit(X_train_dta, y_train)

# plot_importance opens its own figure unless an axes is supplied; pass one so that no
# empty figure is left behind. Shows the 15 most important features.
fig, ax = plt.subplots(figsize=(10, 6))
xgb.plot_importance(xgb_for_plot, max_num_features=15, ax=ax)
ax.set_title('Feature Importance - XGBoost')
plt.show()

# 4. Permutation importance, evaluated on the full pipelines so that each model sees
#    data preprocessed exactly as during fitting. (Scoring the bare classifiers on raw
#    features, as before, measures models that were trained on standardized inputs.)
feature_importances = pd.DataFrame(
    {
        name: permutation_importance(
            model, X_train_dta, y_train, n_repeats=10, random_state=42, n_jobs=-1
        ).importances_mean
        for name, model in classifiers.items()
    },
    index=X_train_dta.columns,
)

# Long format (one row per feature/model pair), as required by the grouped bar plot.
feature_importances = feature_importances.reset_index().melt(id_vars="index",
                                                             var_name="Modello",
                                                             value_name="Importanza")
feature_importances = feature_importances.rename(columns={"index": "Feature"})

# Grouped bar chart comparing the models feature by feature.
plt.figure(figsize=(12, 8))
sns.barplot(data=feature_importances, x="Feature", y="Importanza", hue="Modello")
plt.xticks(rotation=45, ha='right')
plt.title("Importanza delle feature per ciascun modello")
plt.ylabel("Mean decrease in accuracy")
plt.legend(title="Modello")
plt.tight_layout()
plt.show()

In [None]:
from sklearn.metrics import accuracy_score
# Hand-picked feature subset for each base model.
features_lr = ['Title_Mr','Pclass_Fare', 'Sex', 'AgeGroup_Adults','AgeGroup_Young Adults','AgeGroup_Adolescents','Embarked_S','Embarked_Q']
features_dt = ['Title_Mr','Pclass_Fare','FamilySize','Title_Rare','AgeGroup_Young Adults','TicketCategory_S', 'TicketCategory_W']
features_knn = ['Pclass_Fare','Title_Mr','Sex']
features_rf = ['Sex','FamilySize','Title_Mrs','Title_Miss']
features_xg = ['Pclass_Fare','Title_Mr','FamilySize','AgeGroup_Adults','Embarked_S','TicketCategory_NoLetter','TicketCategory_S','AgeGroup_Young Adults','AgeGroup_Children','Embarked_Q']

# Slice the train / hold-out test / Kaggle frames with each subset, one model per line.
_split_frames = (X_train_dta, X_test_dta, X_val_dta)
X_train_dta_LR, X_test_dta_LR, X_val_dta_LR = [frame[features_lr] for frame in _split_frames]
X_train_dta_DT, X_test_dta_DT, X_val_dta_DT = [frame[features_dt] for frame in _split_frames]
X_train_dta_KNN, X_test_dta_KNN, X_val_dta_KNN = [frame[features_knn] for frame in _split_frames]
X_train_dta_RF, X_test_dta_RF, X_val_dta_RF = [frame[features_rf] for frame in _split_frames]
X_train_dta_XG, X_test_dta_XG, X_val_dta_XG = [frame[features_xg] for frame in _split_frames]

def _cv_accuracy(label, pipeline, features):
    """Run 10-fold CV accuracy for `pipeline` on `features`/`y_train`, print a summary, return the fold scores."""
    scores = cross_val_score(estimator=pipeline, X=features, y=y_train, cv=10, scoring='accuracy')
    # `.2f` for both numbers: the previous `-2f` spec is "sign, width 2, default precision"
    # and therefore printed the standard deviation with six decimals.
    print(f'Accuracy: {scores.mean():.2f} (+/- {scores.std():.2f}) {label}')
    return scores


# Accuracy of every base model on its own feature subset.
print('10-fold cross validation:\n')
scores_LR = _cv_accuracy('Logistic Regression', pipe1, X_train_dta_LR)
scores_DT = _cv_accuracy('Decision Tree', pipe2, X_train_dta_DT)
scores_KNN = _cv_accuracy('KNN', pipe3, X_train_dta_KNN)
scores_RF = _cv_accuracy('Random Forest', pipe4, X_train_dta_RF)
scores_XG = _cv_accuracy('XGBoost', pipe5, X_train_dta_XG)

# Fit every base pipeline on its own training subset.
for base_pipe, train_subset in (
    (pipe1, X_train_dta_LR),   # Logistic Regression
    (pipe2, X_train_dta_DT),   # Decision Tree
    (pipe3, X_train_dta_KNN),  # KNN
    (pipe4, X_train_dta_RF),   # Random Forest
    (pipe5, X_train_dta_XG),   # XGBoost
):
    base_pipe.fit(train_subset, y_train)

# Hard class predictions of each model on the train, hold-out test and Kaggle sets.
y_train_dta_LR, y_test_dta_LR, y_val_dta_LR = [
    pipe1.predict(part) for part in (X_train_dta_LR, X_test_dta_LR, X_val_dta_LR)]
y_train_dta_DT, y_test_dta_DT, y_val_dta_DT = [
    pipe2.predict(part) for part in (X_train_dta_DT, X_test_dta_DT, X_val_dta_DT)]
y_train_dta_KNN, y_test_dta_KNN, y_val_dta_KNN = [
    pipe3.predict(part) for part in (X_train_dta_KNN, X_test_dta_KNN, X_val_dta_KNN)]
y_train_dta_RF, y_test_dta_RF, y_val_dta_RF = [
    pipe4.predict(part) for part in (X_train_dta_RF, X_test_dta_RF, X_val_dta_RF)]
y_train_dta_XG, y_test_dta_XG, y_val_dta_XG = [
    pipe5.predict(part) for part in (X_train_dta_XG, X_test_dta_XG, X_val_dta_XG)]

def _weighted_vote(predictions, weights):
    """Combine 0/1 predictions of several models into one 0/1 vote.

    `predictions` is a sequence of equally long 1-D label arrays (one per model) and
    `weights` the matching model weights. Returns the rounded weighted average as ints.
    """
    averaged = np.average(np.asarray(predictions, dtype=float), axis=0, weights=weights)
    return np.round(averaged).astype(int)


# Each model is weighted by its mean cross-validated accuracy. The same weights are used
# for all three sets; previously the LR weight was squared for the Kaggle predictions only.
ensemble_weights = [scores_LR.mean(), scores_DT.mean(), scores_KNN.mean(),
                    scores_RF.mean(), scores_XG.mean()]

y_pred_dta_train = _weighted_vote(
    [y_train_dta_LR, y_train_dta_DT, y_train_dta_KNN, y_train_dta_RF, y_train_dta_XG], ensemble_weights)
y_pred_dta_test = _weighted_vote(
    [y_test_dta_LR, y_test_dta_DT, y_test_dta_KNN, y_test_dta_RF, y_test_dta_XG], ensemble_weights)
y_pred_dta_val = _weighted_vote(
    [y_val_dta_LR, y_val_dta_DT, y_val_dta_KNN, y_val_dta_RF, y_val_dta_XG], ensemble_weights)

# Accuracy of the weighted vote on the training and hold-out test sets.
fp_dta_train_acc = accuracy_score(y_train, y_pred_dta_train)
fp_dta_test_acc = accuracy_score(y_test, y_pred_dta_test)

print(f'Accuracy_train/accuracy_test '
      f'{fp_dta_train_acc:.3f}/{fp_dta_test_acc:.3f}')

from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import accuracy_score
import numpy as np

from sklearn.model_selection import cross_val_predict

# 1. Meta-features: one column of 0/1 predictions per base model.
#    The TRAINING meta-features are out-of-fold predictions. Using the in-sample
#    predictions of the already-fitted models would let the meta-model learn from outputs
#    that have seen their own labels (the high-capacity models nearly memorise the
#    training set), which inflates its trust in them. cross_val_predict works on clones,
#    so the fitted pipelines are left untouched.
_base_models = [
    (pipe1, X_train_dta_LR),
    (pipe2, X_train_dta_DT),
    (pipe3, X_train_dta_KNN),
    (pipe4, X_train_dta_RF),
    (pipe5, X_train_dta_XG),
]
train_meta_features = np.column_stack([
    cross_val_predict(base_pipe, train_subset, y_train, cv=10)
    for base_pipe, train_subset in _base_models
])
# Kaggle and hold-out meta-features come from the pipelines fitted on the full training split.
val_meta_features = np.column_stack([y_val_dta_LR, y_val_dta_DT, y_val_dta_KNN, y_val_dta_RF, y_val_dta_XG])
test_meta_features = np.column_stack([y_test_dta_LR, y_test_dta_DT, y_test_dta_KNN, y_test_dta_RF, y_test_dta_XG])

# 2. Train the Bernoulli Naive Bayes meta-model on the binary meta-features.
meta_model = BernoulliNB()
meta_model.fit(train_meta_features, y_train)

# 3. Meta-model predictions for every set.
train_preds = meta_model.predict(train_meta_features)
val_preds = meta_model.predict(val_meta_features)
test_preds = meta_model.predict(test_meta_features)

# Accuracy (the training figure is computed on out-of-fold meta-features).
train_accuracy = accuracy_score(y_train, train_preds)
test_accuracy = accuracy_score(y_test, test_preds)

print(f"Train Accuracy del meta-modello: {train_accuracy:.4f}")
print(f"Test Accuracy del meta-modello: {test_accuracy:.4f}")

# Kaggle submission built from the meta-model predictions.
output = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': val_preds})
output.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")


from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.ensemble import StackingClassifier
from sklearn.metrics import accuracy_score

# Assumiamo che le feature siano già suddivise per ciascun modello
# Dividi X_train, X_test, X_val nelle feature specifiche per ciascun modello

# Sottinsiemi di feature per ciascun modello (esempio)
from sklearn.model_selection import cross_val_predict

# Per-model feature subsets for the train, hold-out test and Kaggle sets.
X_train_lr, X_test_lr, X_val_lr = X_train[features_lr], X_test[features_lr], X_val[features_lr]
X_train_dt, X_test_dt, X_val_dt = X_train[features_dt], X_test[features_dt], X_val[features_dt]
X_train_knn, X_test_knn, X_val_knn = X_train[features_knn], X_test[features_knn], X_val[features_knn]
X_train_rf, X_test_rf, X_val_rf = X_train[features_rf], X_test[features_rf], X_val[features_rf]
X_train_xg, X_test_xg, X_val_xg = X_train[features_xg], X_test[features_xg], X_val[features_xg]

# Base models.
lr = LogisticRegression(max_iter=1000, random_state=42)
dt = DecisionTreeClassifier(random_state=42)
knn = KNeighborsClassifier(n_neighbors=5)

# (name, estimator, train, test, kaggle) for every base model that is currently enabled.
# The decision tree, random forest and XGBoost base models are intentionally left out.
base_classifiers = [
    ('lr', lr, X_train_lr, X_test_lr, X_val_lr),
    ('knn', knn, X_train_knn, X_test_knn, X_val_knn),
]

# Build the meta-features: positive-class probability of each base model.
train_meta_features = []
val_meta_features = []
test_meta_features = []

for name, clf, X_tr, X_te, X_va in base_classifiers:
    print(f"Addestramento del modello {name}...")
    # Training meta-features are out-of-fold probabilities, so the meta-model is not
    # trained on predictions the base model made for samples it was fitted on.
    out_of_fold = cross_val_predict(clf, X_tr, y_train, cv=5, method='predict_proba')[:, 1]
    train_meta_features.append(out_of_fold.reshape(-1, 1))

    # Fit on the full training split to score the hold-out test and Kaggle sets.
    clf.fit(X_tr, y_train)
    val_meta_features.append(clf.predict_proba(X_va)[:, 1].reshape(-1, 1))
    test_meta_features.append(clf.predict_proba(X_te)[:, 1].reshape(-1, 1))

# One column per base model.
train_meta = np.hstack(train_meta_features)
val_meta = np.hstack(val_meta_features)
test_meta = np.hstack(test_meta_features)

# Meta-model: gradient boosting on the stacked probabilities. `use_label_encoder` is
# omitted, matching how XGBClassifier is constructed elsewhere in this notebook (the
# argument is deprecated in recent XGBoost releases).
meta_model = xgb.XGBClassifier(eval_metric='logloss', random_state=42)
meta_model.fit(train_meta, y_train)

# Performance of the stacked model.
train_predictions = meta_model.predict(train_meta)
val_predictions = meta_model.predict(val_meta)
test_predictions = meta_model.predict(test_meta)

print("Accuracy su Training set:", accuracy_score(y_train, train_predictions))
#print("Accuracy su Validation set:", accuracy_score(y_val, val_predictions))
print("Accuracy su Test set:", accuracy_score(y_test, test_predictions))


from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Supponiamo di avere già diviso il dataset in train, validation e test
# train_data è il dataset completo


import numpy as np

# Dedicated 70/30 split for the bagging experiment. It uses its own names on purpose:
# rebinding X_train / y_train here would leave them out of sync with the feature subsets
# derived from the earlier 80/20 split (X_train_dta_LR, ...), and any later code pairing
# those with y_train would fail with inconsistent sample counts.
X_train_bag, X_test_bag, y_train_bag, y_test_bag = train_test_split(
    X, y, stratify=y, test_size=0.3, random_state=42)

# One bagging ensemble (10 bootstrap replicas) around each base pipeline.
bagging_lr = BaggingClassifier(estimator=pipe1, n_estimators=10, random_state=42)
bagging_dt = BaggingClassifier(estimator=pipe2, n_estimators=10, random_state=42)
bagging_knn = BaggingClassifier(estimator=pipe3, n_estimators=10, random_state=42)
bagging_rf = BaggingClassifier(estimator=pipe4, n_estimators=10, random_state=42)
bagging_xg = BaggingClassifier(estimator=pipe5, n_estimators=10, random_state=42)

# Each ensemble is paired with the feature subset of its base model.
bagged_models = [
    (bagging_lr, features_lr),
    (bagging_dt, features_dt),
    (bagging_knn, features_knn),
    (bagging_rf, features_rf),
    (bagging_xg, features_xg),
]

for bagged_model, model_features in bagged_models:
    bagged_model.fit(X_train_bag[model_features], y_train_bag)


def _stack_predictions(frame):
    """Return an array with one row of predictions per bagged model for `frame`."""
    return np.array([model.predict(frame[cols]) for model, cols in bagged_models])


def _majority_vote(votes):
    """Most frequent label in every column of `votes` (rows are models, columns are samples)."""
    return np.apply_along_axis(lambda column: np.bincount(column).argmax(), axis=0, arr=votes)


# Predictions on the training, hold-out test and Kaggle sets.
train_preds = _stack_predictions(X_train_bag)
test_preds = _stack_predictions(X_test_bag)
val_preds = _stack_predictions(X_val)

# Final ensemble: majority vote across the five bagged models.
train_final_preds = _majority_vote(train_preds)
test_final_preds = _majority_vote(test_preds)
val_final_preds = _majority_vote(val_preds)

# Accuracy of the majority vote.
train_accuracy = accuracy_score(y_train_bag, train_final_preds)
test_accuracy = accuracy_score(y_test_bag, test_final_preds)

print(f"Train Accuracy: {train_accuracy:.4f}")
print(f"Test Accuracy: {test_accuracy:.4f}")

# NOTE(review): this overwrites the submission written earlier and uses the weighted-vote
# predictions (y_pred_dta_val), not the bagging result (val_final_preds) computed above.
# Kept as in the original; confirm which ensemble is meant to be submitted.
output = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': y_pred_dta_val})
output.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")


# Disabled experiment, kept as an inert string literal (it is never executed):
# grid search over a bagged decision tree (n_estimators, max_depth, max_samples,
# max_features) with 5-fold CV, followed by train/test scoring of the best model.
'''
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

# Impostiamo il classificatore base e il Bagging model
estimator = DecisionTreeClassifier()
bagging_model = BaggingClassifier(estimator=estimator, random_state=42)

# Definiamo la griglia di parametri
param_grid = {
    'n_estimators': [10, 50, 100, 200],
    'estimator__max_depth': [3, 5, 10, None],
    'max_samples': [0.5, 0.7, 1.0],
    'max_features': [0.5, 0.7, 1.0]
}

# Usare GridSearchCV per trovare la combinazione migliore
grid_search = GridSearchCV(bagging_model, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Stampa i migliori parametri trovati e l'accuracy corrispondente
print("Best Parameters:", grid_search.best_params_)
print("Best Accuracy:", grid_search.best_score_)

# Valutazione su set di test e validazione
best_bagging_model = grid_search.best_estimator_
train_accuracyGV = best_bagging_model.score(X_train, y_train)
test_accuracyGV = best_bagging_model.score(X_test, y_test)


print(f"Validation Accuracy GV: {train_accuracyGV:.4f}")
print(f"Test Accuracy GV: {test_accuracyGV:.4f}")
'''

#Loss Curve for Logistic Regression
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

# Regularisation strengths to evaluate: 20 values from 10^-4 to 10^4 on a log scale.
C_values = np.logspace(-4, 4, 20)

# Take the logistic-regression features from the CURRENT X_train so that the rows always
# match y_train. X_train_dta_LR was sliced from the earlier 80/20 split, and the bagging
# section above re-splits the data, which made the two lengths disagree.
X_loss_curve = X_train[features_lr]

# Mean 5-fold cross-entropy (negative log-likelihood) for every value of C.
losses = []
for C in C_values:
    clf = LogisticRegression(C=C, max_iter=1000)
    # scikit-learn reports the negated loss, so flip the sign back.
    loss = -cross_val_score(clf, X_loss_curve, y_train, cv=5, scoring='neg_log_loss').mean()
    losses.append(loss)

# Loss as a function of C (log-scaled x axis).
plt.figure(figsize=(10, 6))
plt.plot(C_values, losses, marker='o')
plt.xscale('log')
plt.xlabel('C (Regularization parameter)')
plt.ylabel('Log Loss')
plt.title('Loss Curve for Logistic Regression')
plt.show()

'''
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
import xgboost as xgb
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
import numpy as np

# Best parameters noted from a previous tuning run:
# {'pipe3__clf__n_neighbors': 7,
#  'pipe5__clf__learning_rate': 0.06, 'pipe5__clf__n_estimators': 2000}

# Candidate classifiers
clf1 = LogisticRegression(penalty = 'l2', C = 0.001, solver = 'lbfgs', random_state = 1, class_weight = 'balanced')
clf2 = DecisionTreeClassifier(max_depth = 1, criterion = 'entropy', random_state = 0, class_weight = 'balanced')
clf3 = KNeighborsClassifier(n_neighbors = 7, p = 2, metric = 'minkowski')
clf4 = RandomForestClassifier(n_estimators=50, max_depth=5, random_state=1)
clf5 = xgb.XGBClassifier(n_estimators = 1500, learning_rate =0.06, max_depth=5, random_state=42)

# One pipeline per classifier; every one except the decision tree standardizes
# the features first. The step names (clf1..clf5) are the ones used in
# parameter grids elsewhere in the notebook.
pipe1 = Pipeline([['sc', StandardScaler()], ['clf1', clf1]])
pipe2 = Pipeline([['clf2', clf2]])
pipe3 = Pipeline([['sc', StandardScaler()],['clf3', clf3]])
pipe4 = Pipeline([['sc', StandardScaler()], ['clf4', clf4]])
pipe5 = Pipeline([['sc', StandardScaler()],['clf5', clf5]])

clf_labels = ['Logistic regression', 'Decision Tree', 'KNN', 'Random Forest', 'XGBoost']
print('10-fold cross validation:\n')
pipes = [pipe1, pipe2, pipe3,pipe4, pipe5]
for clf, label in zip(pipes, clf_labels):
    scores = cross_val_score(estimator=clf, X = X_train, y=y_train, cv=10, scoring='accuracy')
    # ".2f" = two decimals. The former "-2f" was parsed as a sign flag plus a
    # minimum width of 2, so the standard deviation came out with six decimals.
    print(f'Accuracy: {scores.mean():.2f}'
         f'(+/- {scores.std():.2f})[{label}]')


# Feature importance for RandomForestClassifier and XGBoost

# 1. Model training
# .values returns the underlying array, i.e. the data without the feature names
X_train_np = X_train.values

# Each pipeline is trained on the DataFrame, except the decision tree (pipe2),
# which is given the bare array.
for _pipe, _features in (
    (pipe1, X_train),     # Logistic Regression
    (pipe2, X_train_np),  # Decision Tree
    (pipe3, X_train),     # KNN
    (pipe4, X_train),     # Random Forest
    (pipe5, X_train),     # XGBoost
):
    _pipe.fit(_features, y_train)

# 2. Importance extraction: clf4 is the final step of pipe4, fitted just above
importances = clf4.feature_importances_

# 3. Importance visualisation
# Series indexed by feature name, most important feature first
feat_importances = pd.Series(importances, index=X_train.columns).sort_values(ascending=False)

# Bar chart
plt.figure(figsize=(10, 6))
feat_importances.plot(kind='bar')
plt.title('Feature Importance - Random Forest')
plt.show()

# 1. Model training
# clf5 is fitted here directly on X_train (no scaler in front). NOTE: clf5 is
# the same object that sits at the end of pipe5, so this replaces the fit that
# was done through the pipeline.
clf5.fit(X_train, y_train)

# 2. Importance visualisation (using plot_importance)
# plot_importance creates its own figure when no Axes is supplied, which left
# the 10x6 figure opened here empty; draw on an explicit Axes instead.
xgb_fig, xgb_ax = plt.subplots(figsize=(10, 6))
xgb.plot_importance(clf5, max_num_features=10, ax=xgb_ax)  # show the 10 most important features
xgb_ax.set_title('Feature Importance - XGBoost')
plt.show()

from sklearn.inspection import permutation_importance
import matplotlib.pyplot as plt

# Models to inspect, keyed by display name.
# The scaled models are passed as whole pipelines so that permutation_importance
# feeds them standardized features, exactly as during training; scoring the bare
# final steps on unscaled data (as was done before) measures a model that is
# looking at inputs on the wrong scale.
# XGBoost stays as the bare classifier because clf5 was last fitted directly on
# X_train, without the scaler.
classifiers = {
    'Logistic Regression': pipe1,
    'Decision Tree': pipe2,
    'KNN': pipe3,
    'Random Forest': pipe4,
    'XGBoost': pipe5.named_steps['clf5']
}

# Give every model the same kind of container it was fitted on: the DataFrame
# for all of them except the decision tree pipeline, which was fitted on the
# bare array.
importance_inputs = {name: X_train for name in classifiers}
importance_inputs['Decision Tree'] = X_train_np

# One column of mean permutation importances per model, indexed by feature name
feature_importances = pd.DataFrame()

# Compute the permutation importances for each model
for name, clf in classifiers.items():
    results = permutation_importance(clf, importance_inputs[name], y_train,
                                     n_repeats=10, random_state=42, n_jobs=-1)
    feat_importances = pd.Series(results.importances_mean, index=X_train.columns)

    # Add the results to the DataFrame
    feature_importances[name] = feat_importances

# Reshape to long format (one row per feature/model pair), as needed by the plot
feature_importances = feature_importances.reset_index().melt(id_vars="index",
                                                             var_name="Modello",
                                                             value_name="Importanza")
feature_importances = feature_importances.rename(columns={"index": "Feature"})

# Plotting: grouped bars, one group per feature and one bar per model
plt.figure(figsize=(12, 8))
sns.barplot(data=feature_importances, x="Feature", y="Importanza", hue="Modello")
plt.xticks(rotation=45, ha='right')
plt.title("Importanza delle feature per ciascun modello")
plt.ylabel("Mean decrease in accuracy")
plt.legend(title="Modello")
plt.tight_layout()
plt.show()
'''

'''
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import VotingClassifier

# Hyper-parameter grid for the tuning. Keys are spelled
# <voting estimator name>__<pipeline step name>__<parameter>.
param_grid = dict(
    pipe1__clf1__C=[0.01, 0.1],
    pipe2__clf2__max_depth=[5, 8],
    pipe3__clf3__n_neighbors=[7, 3],
    pipe4__clf4__n_estimators=[100, 200],
    pipe4__clf4__max_depth=[5, 7],
    pipe5__clf5__learning_rate=[0.08, 0.01],
    pipe5__clf5__n_estimators=[1500, 100],
)

# Hard-voting ensemble built from the five pipelines
voting_members = [
    ('pipe1', pipe1),
    ('pipe2', pipe2),
    ('pipe3', pipe3),
    ('pipe4', pipe4),
    ('pipe5', pipe5),
]
voting_clf = VotingClassifier(estimators=voting_members, voting='hard')

# Run the grid search: 5-fold cross-validation scored on accuracy
grid = GridSearchCV(voting_clf, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid.fit(X_train_dta, y_train)

print(f"Best parameters: {grid.best_params_}")
print(f"Best cross-validation accuracy: {grid.best_score_}")
'''

Best parameters: {'pipe1__clf1__C': 0.01, 'pipe2__clf2__max_depth': 5, 'pipe3__clf3__n_neighbors': 7, 'pipe4__clf4__max_depth': 5, 'pipe4__clf4__n_estimators': 100, 'pipe5__clf5__learning_rate': 0.08, 'pipe5__clf5__n_estimators': 1500}
Best cross-validation accuracy: 0.830089628681178

'''
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score

from sklearn.inspection import PartialDependenceDisplay

# Hard-voting ensemble over the five classifier pipelines
voting_clf = VotingClassifier(
    estimators=[
        ('lr', pipe1),
        ('DTC', pipe2),
        ('KNN', pipe3),
        ('RF', pipe4),
        ('XG', pipe5),
    ],
    voting='hard',
)

# Training
voting_clf.fit(X_train_dta, y_train)

# Refit the Random Forest and XGBoost pipelines on X_train and show, for each,
# the two-way partial dependence on (Pclass, Fare)
for _pd_pipe in (pipe4, pipe5):
    _pd_pipe.fit(X_train, y_train)
    PartialDependenceDisplay.from_estimator(_pd_pipe, X_train, [('Pclass', 'Fare')])
    plt.show()

# Predictions of the ensemble and its accuracy on the train and test splits
y_train_pred = voting_clf.predict(X_train_dta)
y_test_pred = voting_clf.predict(X_test_dta)
Majoring_vote_train = accuracy_score(y_train, y_train_pred)
Majoring_vote_test = accuracy_score(y_test, y_test_pred)
print(f'MV train/test accuracies {Majoring_vote_train:.3f}/{Majoring_vote_test:.3f}')

# Predict on X_val_dta and write the submission file
predictions = voting_clf.predict(X_val_dta)
output = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': predictions})
output.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")
'''

'''
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score

# Build the hard-voting ensemble from the five classifier pipelines
ensemble_members = [
    ('lr', pipe1),
    ('DTC', pipe2),
    ('KNN', pipe3),
    ('RF', pipe4),
    ('XG', pipe5),
]
voting_clf = VotingClassifier(estimators=ensemble_members, voting='hard')

# Training
voting_clf.fit(X_train_dta, y_train)


'''
#Stardadize the features
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.metrics import accuracy_score
sc = StandardScaler()
X_train_std = sc.fit_transform(X_train)
X_test_std = sc.transform(X_test)
X_val_std = sc.transform(X_val)

lda = LDA(n_components=1)
model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=1, class_weight='balanced')
X_train_lda = lda.fit_transform(X_train_std, y_train)
X_test_lda = lda.transform(X_test_std)
X_val_lda = lda.transform(X_val_std)
model.fit(X_train_lda, y_train)
y_train_pred = model.predict(X_train_lda)
y_test_pred = model.predict(X_test_lda)
Forest_train = accuracy_score(y_train, y_train_pred)
Forest_test = accuracy_score(y_test, y_test_pred)
print(f'Forest train/test accuracies '
      f'{Forest_train:.3f}/{Forest_test:.3f}')

predictions = model.predict(X_val_lda)
output = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': predictions})
output.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")
'''

'''
from sklearn.linear_model import LogisticRegression
​
# Selezionare le feature per il modello della prima e seconda classe
X_train_first_second_class = first_second_class[["Sex", "SibSp", "Parch", "Fare"]]  # Usa le tue feature
X_train_first_second_class = pd.get_dummies(X_train_first_second_class)
y_train_first_second_class = first_second_class["Survived"]
​
# Creare e addestrare il modello per la prima e seconda classe
model_first_second_class = LogisticRegression()
model_first_second_class.fit(X_train_first_second_class, y_train_first_second_class)
y_train_pred_first_second_class = model_first_second_class.predict(X_train_first_second_class)
​
# Selezionare le feature per il modello della terza classe
X_train_third_class = third_class[["Sex", "SibSp", "Parch", "Fare"]]  # Usa le tue feature
X_train_third_class = pd.get_dummies(X_train_third_class)
y_train_third_class = third_class["Survived"]
​
# Creare e addestrare il modello per la prima e seconda classe
model_third_class = LogisticRegression()
model_third_class.fit(X_train_third_class, y_train_third_class)
y_train_pred_third_class = model_third_class.predict(X_train_third_class)
​
​
​
# Feature per ciascuna classe
X_test_first_second_class = pd.get_dummies(first_second_class_test[["Sex", "SibSp", "Parch", "Fare"]])
X_test_third_class = pd.get_dummies(third_class_test[["Sex", "SibSp", "Parch", "Fare"]])
​
# Predizioni per ciascuna classe
y_pred_first_second_class = model_first_second_class.predict(X_test_first_second_class)
y_pred_third_class = model_third_class.predict(X_test_third_class)
​
import numpy as np
​
# Combinare le predizioni per tutte le classi
y_pred_combined = np.concatenate([y_train_pred_first_second_class, y_train_pred_third_class])
y_train = np.concatenate([y_train_first_second_class, y_train_third_class]) 
​
from sklearn.metrics import accuracy_score
​
# Calcolare l'accuratezza globale
global_accuracy = accuracy_score(y_train, y_pred_combined)
print(f'Accuratezza Globale: {global_accuracy:.3f}')
​
output = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': predictions})
output.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")
'''

In [None]:
'''
#Selection Features through PCA
from sklearn.preprocessing import StandardScaler
import numpy as np
import matplotlib.pyplot as plt

#Stardadize the features
sc = StandardScaler()
X_train_std = sc.fit_transform(X_train)
X_test_std = sc.transform(X_test)
X_val_std = sc.transform(X_val)

cov_mat = np.cov(X_train_std.T)
eigen_vals, eigen_vecs = np.linalg.eig(cov_mat)
print('\nEigenvalues \n', eigen_vals)

tot = sum(eigen_vals)
var_exp =[(i/tot) for i in sorted(eigen_vals, reverse = True)]
cum_var_exp = np.cumsum(var_exp)

plt.bar(range(1,11), var_exp, align = 'center', label = 'Individual explained variance')
plt.step(range(1,11), cum_var_exp, where = 'mid', label = 'Cumulative explained variance')
plt.ylabel('Explained variance ratio')
plt.xlabel('Pricipal component index')
plt.legend(loc='best')
plt.tight_layout()
plt.show()
'''

In [None]:
'''
#Feature Transformation
#Make a list of (eigenvalue, eigenvector) tuples
eigen_pairs = [(np.abs(eigen_vals[i]), eigen_vecs[:,i]) for i in range(len(eigen_vals))]
eigen_pairs.sort(key = lambda k: k[0], reverse = True)
#Selection of the first two eigenvectors
w = np.hstack((eigen_pairs[0][1][:,np.newaxis],
              eigen_pairs[1][1][:,np.newaxis]))
print('Matrix W:\n', w)
X_train_pca = X_train_std.dot(w)

colors = ['r','b']
markers = ['o','s']
for l, c, m in zip(np.unique(y_train), colors, markers):
    plt.scatter(X_train_pca[y_train==l, 0],
               X_train_pca[y_train==l, 1],
               c=c, label=f'Class {l}', marker = m)
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.legend(loc = 'lower left')
plt.tight_layout()
plt.show()
'''
'''
from sklearn.ensemble import RandomForestClassifier
#from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score

pca = PCA(n_components=2)
#lr = LogisticRegression(multi_class='ovr', random_state=1, solver='lbfgs')
model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=1)
X_train_pca = pca.fit_transform(X_train_std)
X_test_pca = pca.transform(X_test_std)
X_val_pca = pca.transform(X_val_std)
model.fit(X_train_pca, y_train)
y_train_pred = model.predict(X_train_pca)
y_test_pred = model.predict(X_test_pca)
Forest_train = accuracy_score(y_train, y_train_pred)
Forest_test = accuracy_score(y_test, y_test_pred)
print(f'Forest train/test accuracies '
      f'{Forest_train:.3f}/{Forest_test:.3f}')
'''

## '''

# Assess feature importances with a RandomForestClassifier
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt

feat_labels = train_data[['Pclass','Sex','Age','SibSp','Parch','Fare', 'Title_Miss', 'Title_Mr', 'Title_Mrs', 'Title_Rare']]

forest = RandomForestClassifier(n_estimators = 500, random_state=1)
forest.fit(X_train, y_train)
importances = forest.feature_importances_
# Feature positions ordered from the most to the least important
indices = np.argsort(importances)[::-1]

n_features = X_train.shape[1]
ranked_names = X_train.columns[indices]
ranked_scores = importances[indices]

# Text ranking: position, feature name (padded to 30 characters), importance
print("Feature importances:")
for f in range(n_features):
    print("%2d) %-*s %f" % (f+1, 30, ranked_names[f], ranked_scores[f]))

# Same ranking as a bar chart, feature names on the x axis
plt.title('Feature importance')
plt.bar(range(n_features), ranked_scores, align = 'center')
plt.xticks(range(n_features), ranked_names, rotation=90)
plt.xlim([-1, n_features])
plt.tight_layout()
plt.show()
 
import xgboost as xgb
from sklearn.metrics import accuracy_score

# Columns left out of the reduced feature set
dropped_features = ['SibSp','Parch','Title_Miss','Title_Mrs','Title_Rare','Pclass']

X_train_select = X_train.drop(columns = dropped_features)
X_test_select = X_test.drop(columns = dropped_features)
# test_data still carries PassengerId, which is removed as well
X_val = test_data.drop(columns = ['PassengerId'] + dropped_features)

# XGBoost classifier trained on the reduced feature set
xgb_settings = dict(
    n_estimators = 2000,
    learning_rate = 0.001,
    max_depth = 4,
    random_state = 1,
    use_label_encoder = False,
)
model = xgb.XGBClassifier(**xgb_settings)
gbm = model.fit(X_train_select, y_train)

y_train_pred = gbm.predict(X_train_select)
y_test_pred = gbm.predict(X_test_select)

# Accuracy on the train and test splits
gbm_train = accuracy_score(y_train, y_train_pred)
gbm_test = accuracy_score(y_test, y_test_pred)
print(f'XGBoost train/test accuracies {gbm_train:.3f}/{gbm_test:.3f}')

import seaborn as sns

# Compare predicted and actual survival by passenger class and sex.
# 'Pclass' was dropped from X_train_select, so it is taken back from X_train
# (same rows, same index); without it both barplots fail on x='Pclass'.
new_df = X_train_select.copy()
new_df['Pclass'] = X_train['Pclass']
new_df['Survived']=y_train
new_df['Predictions']=y_train_pred
sns.barplot(x='Pclass', y='Predictions', hue = 'Sex',data=new_df)
plt.show()
sns.barplot(x='Pclass', y='Survived', hue = 'Sex',data=new_df)
plt.show()

# The opening delimiter of this once-disabled section is commented out as
# "## '''", so the closing one is commented out the same way; a bare ''' here
# starts an unterminated string and the whole cell fails to parse.
## '''

In [None]:
'''
predictions = model.predict(X_val_pca)

output = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': predictions})
output.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")
'''

In [None]:
# Algorithm that splits the passengers by social class
'''
#Confronto per classi
# Filtrare il dataset per classi
first_second_class = train_data[(train_data['Pclass'] == 1) | (train_data['Pclass']==2)]
#second_class = train_data[train_data['Pclass'] == 2]
third_class = train_data[train_data['Pclass'] == 3]

import seaborn as sns
import matplotlib.pyplot as plt

# Barplot del tasso di sopravvivenza per classe
sns.barplot(x='Pclass', y='Survived', hue = 'Sex', data=train_data)
plt.title('Tasso di Sopravvivenza per Classe e Sesso')
plt.xlabel('Classe')
plt.ylabel('Tasso di Sopravvivenza')
plt.show()

# Boxplot dell'età per classe e sopravvivenza
sns.boxplot(x='Pclass', y='Age', hue='Survived', data=train_data)
plt.title('Distribuzione dell\'età per Classe e Sopravvivenza')
plt.xlabel('Classe')
plt.ylabel('Età')
plt.show()

# Prima e seconda classe combinate
first_and_second_class = train_data[train_data['Pclass'].isin([1, 2])]
third_class = train_data[train_data['Pclass'] == 3]

# Confronto del tasso di sopravvivenza
print(f"Sopravvivenza media - Prima e Seconda Classe: {first_and_second_class['Survived'].mean():.3f}")
print(f"Sopravvivenza media - Terza Classe: {third_class['Survived'].mean():.3f}")

# Boxplot del prezzo del biglietto per classe
sns.boxplot(x='Pclass', y='Fare', data=train_data)
plt.title('Prezzo del Biglietto per Classe')
plt.xlabel('Classe')
plt.ylabel('Prezzo del Biglietto')
plt.show()

first_second_class_tr_data=first_second_class.drop(['Name','Cabin','Embarked','Ticket'],axis=1)
third_class_tr_data=third_class.drop(['Name','Cabin','Embarked','Ticket'],axis=1)
test_data=test_data.drop(['Name','Cabin','Embarked','Ticket'],axis=1)
first_second_class_test = test_data[(test_data['Pclass'] == 1) | (test_data['Pclass']==2)]
third_class_test = test_data[test_data['Pclass'] == 3]

test_data_sorted = test_data.sort_values(by='PassengerId')

# Supponendo di avere i test_data separati in classi
first_second_class_test_sorted = first_second_class_test.sort_values(by='PassengerId')
third_class_test_sorted = third_class_test.sort_values(by='PassengerId')

# Rimettere insieme i test set
combined_test_data = pd.concat([first_second_class_test_sorted,  
                                third_class_test_sorted])


X_val_first_second_class = first_second_class_test_sorted.drop(columns = ['PassengerId'])
X_val_third_class = third_class_test_sorted.drop(columns = ['PassengerId'])
X_first_second_class = first_second_class_tr_data.drop(columns=['Survived','PassengerId'])
X_third_class = third_class_tr_data.drop(columns=['Survived','PassengerId'])
from sklearn.model_selection import train_test_split
y_first_second_class = first_second_class_tr_data['Survived']
y_third_class = third_class_tr_data['Survived']
X_f_s_train, X_f_s_test, y_f_s_train, y_f_s_test = train_test_split(X_first_second_class, y_first_second_class, test_size=0.2, stratify=y_first_second_class, random_state=42)
X_t_train, X_t_test, y_t_train, y_t_test =  train_test_split(X_third_class,y_third_class, test_size=0.2, stratify=y_third_class, random_state=42)

#Applichiamo PCA e Random Forest Classifier alla prima e seconda clase
#Selection Features through PCA
from sklearn.preprocessing import StandardScaler
import numpy as np

#Stardadize the features
sc_f_s = StandardScaler()
X_train_f_s_std = sc_f_s.fit_transform(X_f_s_train)
X_test_f_s_std = sc_f_s.transform(X_f_s_test)
X_val_f_s_std = sc_f_s.transform(X_val_first_second_class)

from sklearn.ensemble import RandomForestClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.metrics import accuracy_score

lda_f_s = LDA(n_components=1)
model_f_s = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=1, class_weight='balanced')
X_train_f_s_lda = lda_f_s.fit_transform(X_train_f_s_std, y_f_s_train)
X_test_f_s_lda = lda_f_s.transform(X_test_f_s_std)
X_val_f_s_lda = lda_f_s.transform(X_val_f_s_std)
model_f_s.fit(X_train_f_s_lda, y_f_s_train)
y_train_f_s_pred = model_f_s.predict(X_train_f_s_lda)
y_test_f_s_pred = model_f_s.predict(X_test_f_s_lda)
Forest_train_f_s = accuracy_score(y_f_s_train, y_train_f_s_pred)
Forest_test_f_s = accuracy_score(y_f_s_test, y_test_f_s_pred)
print(f'Forest train First and Second Class/test accuracies '
      f'{Forest_train_f_s:.3f}/{Forest_test_f_s:.3f}')

predictions_f_s = model_f_s.predict(X_val_f_s_lda)


#Applichiamo PCA e Random Forest Classifier alla terza clase
#Selection Features through PCA
from sklearn.preprocessing import StandardScaler
import numpy as np

#Stardadize the features
sc_t = StandardScaler()
X_train_t_std = sc_t.fit_transform(X_t_train)
X_test_t_std = sc_t.transform(X_t_test)
X_val_t_std = sc_t.transform(X_val_third_class)

from sklearn.ensemble import RandomForestClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.metrics import accuracy_score

lda_t = LDA(n_components=1)
model_t = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=1, class_weight='balanced')
X_train_t_lda = lda_t.fit_transform(X_train_t_std, y_t_train)
X_test_t_lda = lda_t.transform(X_test_t_std)
X_val_t_lda = lda_t.transform(X_val_t_std)
model_t.fit(X_train_t_lda, y_t_train)
y_train_t_pred = model_t.predict(X_train_t_lda)
y_test_t_pred = model_t.predict(X_test_t_lda)
Forest_train_t = accuracy_score(y_t_train, y_train_t_pred)
Forest_test_t = accuracy_score(y_t_test, y_test_t_pred)
print(f'Forest train third class/test accuracies '
      f'{Forest_train_t:.3f}/{Forest_test_t:.3f}')

predictions_t = model_t.predict(X_val_t_lda)

# Combinare le predizioni per ogni classe nello stesso ordine
y_pred_combined = np.concatenate([y_train_f_s_pred, y_train_t_pred])
y_test_pred_combined = np.concatenate([predictions_f_s, predictions_t])
y_train = np.concatenate([y_f_s_train, y_t_train])

# Rimettere insieme i dati
combined_test_data['Survived'] = y_test_pred_combined
combined_test_data_sorted = combined_test_data.sort_values(by='PassengerId')

# Calcolare l'accuratezza globale
global_accuracy = accuracy_score(y_train, y_pred_combined)
print(f'Accuratezza Globale: {global_accuracy:.3f}')

submission = combined_test_data_sorted[['PassengerId', 'Survived']].sort_values(by='PassengerId')
submission.to_csv('titanic_submission.csv', index=False)
#output = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': predictions})
#output.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")

print(submission['PassengerId'].head(10))  # Controlla i primi 10 PassengerId
print(test_data['PassengerId'].head(10))   # Confronta con il test_data originale

'''