In [20]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

# Load the training and test tables.
# NOTE(review): these are absolute, machine-specific paths; adjust them when
# running on another machine.
train_data = pd.read_csv('C:/Users/Gaurav/Desktop/ML/train.csv')
test_data = pd.read_csv('C:/Users/Gaurav/Desktop/ML/test.csv')

# Break the '/'-separated "Cabin" string into three separate columns.
# Each frame must be split from its OWN "Cabin" column: assigning the split of
# train_data['Cabin'] to test_data would align on the row index and hand the
# test passengers the cabins of unrelated training passengers.
train_data[['Cabin_Category', 'Cabin_Number', 'Cabin_Other']] = train_data['Cabin'].str.split('/', expand=True)
test_data[['Cabin_Category', 'Cabin_Number', 'Cabin_Other']] = test_data['Cabin'].str.split('/', expand=True)

label_encoder = LabelEncoder()
text_to_numeric_columns = ['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'Cabin_Category', 'Cabin_Other']

# Turn the object-typed categorical columns into integer codes.
# The encoder is fitted ONCE per column on the values of both frames and then
# applied to each frame, so a given category always receives the same integer
# in train and test. Fitting separately on each frame can assign different
# codes to the same category whenever the two sets of distinct values differ.
for column in text_to_numeric_columns:
    # Only object-typed columns are encoded; a column that is already
    # numeric/boolean in a frame is left untouched in that frame.
    frames_to_encode = [frame for frame in (train_data, test_data) if frame[column].dtype == 'O']
    if not frames_to_encode:
        continue

    label_encoder.fit(pd.concat([frame[column] for frame in frames_to_encode], ignore_index=True))
    for frame in frames_to_encode:
        frame[column] = label_encoder.transform(frame[column])


# Width of the accepted band around the mean, in standard deviations, and the
# spending columns that are screened with it.
num_std_dev = 3
columns_of_interest = ['RoomService', 'FoodCourt','ShoppingMall','Spa','VRDeck']

# Screen one column at a time. Because train_data is re-bound after every
# pass, the mean/std of each later column are computed on the rows that
# survived the earlier passes. Comparisons against NaN are False, so rows with
# a missing value in the screened column are kept.
for column_name in columns_of_interest:
    spend = train_data[column_name]

    mean_value = spend.mean()
    std_dev_value = spend.std()
    lower_threshold = mean_value - num_std_dev * std_dev_value
    upper_threshold = mean_value + num_std_dev * std_dev_value

    outliers = spend.lt(lower_threshold) | spend.gt(upper_threshold)
    train_data = train_data[~outliers]

# Disabled code: the same mean +/- num_std_dev * std row filter, written for
# test_data. It is wrapped in a bare triple-quoted string, so Python evaluates
# it as an unused string expression and none of it runs.
# NOTE(review): presumably disabled so that no test rows are dropped before
# prediction -- confirm, and delete the block if it is no longer needed.
'''
for column_name in columns_of_interest:
    mean_value = test_data[column_name].mean()
    std_dev_value = test_data[column_name].std()

    lower_threshold = mean_value - num_std_dev * std_dev_value
    upper_threshold = mean_value + num_std_dev * std_dev_value

    outliers = (test_data[column_name] < lower_threshold) | (test_data[column_name] > upper_threshold)
    
    test_data = test_data[~outliers]
'''

# Opt in to pandas' future "no silent downcasting" behaviour BEFORE any fillna
# call runs; an option only affects operations executed after it is set.
pd.set_option('future.no_silent_downcasting', True)

# train_data may be a row-filtered selection of the frame that was read from
# disk; take an explicit copy so the column assignments below write to an
# independent frame instead of a possible view.
train_data = train_data.copy()

# Fill values are computed from the TRAINING data only and reused for the test
# data, so both frames are imputed with identical constants and no statistic
# of the test set leaks into preprocessing.
fill_values = {'Age': train_data['Age'].median()}
for spend_column in ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']:
    fill_values[spend_column] = train_data[spend_column].mean()

for frame in (train_data, test_data):
    for column_to_fill, fill_value in fill_values.items():
        frame[column_to_fill] = frame[column_to_fill].fillna(fill_value)

    # Cabin_Number comes out of a string split, so it holds text. Convert it
    # to a real numeric column and use 0 where the cabin is missing, rather
    # than leaving a mix of strings and the integer 0 in one column.
    frame['Cabin_Number'] = pd.to_numeric(frame['Cabin_Number']).fillna(0)


# Feature columns fed to every model. The test frame must expose exactly the
# same columns in the same order, so its list is derived from the training one
# instead of being typed out a second time.
selected_train = ['HomePlanet', 'CryoSleep', 'Destination', 'Age', 'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Cabin_Category',
                  'Cabin_Number', 'Cabin_Other']
selected_test = list(selected_train)

X_train = train_data[selected_train]
Y_train = train_data['Transported']

finalTest = test_data[selected_test]

scaler = StandardScaler()

# USE THESE FOR MODELS: scaled feature matrices.
# The scaler is fitted on the training features only; the test features are
# then transformed with those SAME means and standard deviations. Re-fitting
# on the test set would standardise it with different parameters, so the
# models would see test inputs on a different scale than they were trained on.
X_train_scaled = scaler.fit_transform(X_train)
finalTest_scaled = scaler.transform(finalTest)

# NOTE(review): the scaler above is fitted before this split, so the
# validation rows contribute to the scaling statistics. Splitting first and
# fitting on the training fold only would give a cleaner validation estimate.
X_train, X_val, Y_train, Y_val = train_test_split(X_train_scaled, Y_train, test_size=0.2, random_state=42)

# Logistic regression: fit on the training fold, report accuracy and the
# confusion matrix on the validation fold, then predict the scaled test set.
logreg_model = LogisticRegression(random_state=42).fit(X_train, Y_train)

predictionLogreg = logreg_model.predict(X_val)
accuracy_val = accuracy_score(y_true=Y_val, y_pred=predictionLogreg)
conf_matrix = confusion_matrix(y_true=Y_val, y_pred=predictionLogreg)

print("*********LOGISTIC REGRESSION METRICES:**********")
print(
    f"Accuracy on validation set: {accuracy_val}",
    f"Confusion Matrix on validation set:\n{conf_matrix}",
    sep="\n",
)

predictions = logreg_model.predict(finalTest_scaled)
print(predictions)
print()

# Decision tree: fit on the training fold, report accuracy and the confusion
# matrix on the validation fold, then predict the scaled test set.
decision_tree_model = DecisionTreeClassifier(random_state=42).fit(X_train, Y_train)

prediction_decision_tree = decision_tree_model.predict(X_val)
accuracy_decision_tree = accuracy_score(y_true=Y_val, y_pred=prediction_decision_tree)
conf_matrix_decision_tree = confusion_matrix(y_true=Y_val, y_pred=prediction_decision_tree)

print(
    "*********DECISION TREE METRICS:**********",
    f"Accuracy on validation set: {accuracy_decision_tree}",
    f"Confusion Matrix on validation set:\n{conf_matrix_decision_tree}",
    sep="\n",
)

predictions = decision_tree_model.predict(finalTest_scaled)
print(predictions)
print("\n")



# Random forest (100 trees): fit on the training fold, report accuracy and the
# confusion matrix on the validation fold, then predict the scaled test set.
random_forest_model = RandomForestClassifier(n_estimators=100, random_state=42)
random_forest_model = random_forest_model.fit(X_train, Y_train)

prediction_random_forest = random_forest_model.predict(X_val)
accuracy_random_forest = accuracy_score(y_true=Y_val, y_pred=prediction_random_forest)
conf_matrix_random_forest = confusion_matrix(y_true=Y_val, y_pred=prediction_random_forest)

print("*********RANDOM FOREST METRICS:**********")
print(f"Accuracy on validation set: {accuracy_random_forest}")
print(f"Confusion Matrix on validation set:\n{conf_matrix_random_forest}")

predictions = random_forest_model.predict(finalTest_scaled)
print(predictions, end="\n")
print("\n")


# Support vector classifier: fit on the training fold, report accuracy and the
# confusion matrix on the validation fold, then predict the scaled test set.
svm_model = SVC(random_state=42).fit(X_train, Y_train)

prediction_svm = svm_model.predict(X_val)
conf_matrix_svm = confusion_matrix(y_true=Y_val, y_pred=prediction_svm)
accuracy_svm = accuracy_score(y_true=Y_val, y_pred=prediction_svm)

print("*********SVM METRICS:**********")
print(
    f"Accuracy on validation set: {accuracy_svm}",
    f"Confusion Matrix on validation set:\n{conf_matrix_svm}",
    sep="\n",
)

predictions = svm_model.predict(finalTest_scaled)
print(predictions)
print("\n")


# K-nearest neighbours (library defaults): fit on the training fold, report
# accuracy and the confusion matrix on the validation fold, then predict the
# scaled test set.
knn_model = KNeighborsClassifier().fit(X_train, Y_train)

prediction_knn = knn_model.predict(X_val)
accuracy_knn = accuracy_score(y_true=Y_val, y_pred=prediction_knn)
conf_matrix_knn = confusion_matrix(y_true=Y_val, y_pred=prediction_knn)

print(
    "*********KNN METRICS:**********",
    f"Accuracy on validation set: {accuracy_knn}",
    f"Confusion Matrix on validation set:\n{conf_matrix_knn}",
    sep="\n",
)

predictions = knn_model.predict(finalTest_scaled)
print(predictions)
print("\n")



*********LOGISTIC REGRESSION METRICES:**********
Accuracy on validation set: 0.7744171392564587
Confusion Matrix on validation set:
[[532 213]
 [145 697]]
[ True False  True ...  True  True  True]

*********DECISION TREE METRICS:**********
Accuracy on validation set: 0.7189666036546944
Confusion Matrix on validation set:
[[518 227]
 [219 623]]
[False  True  True ...  True False  True]


*********RANDOM FOREST METRICS:**********
Accuracy on validation set: 0.7889098928796471
Confusion Matrix on validation set:
[[610 135]
 [200 642]]
[False False  True ... False False False]


*********SVM METRICS:**********
Accuracy on validation set: 0.7838689350976685
Confusion Matrix on validation set:
[[559 186]
 [157 685]]
[ True False  True ...  True False  True]


*********KNN METRICS:**********
Accuracy on validation set: 0.7637051039697542
Confusion Matrix on validation set:
[[577 168]
 [207 635]]
[ True False  True ...  True False  True]


