# Importing Libraries

In [137]:
# Installs xgboost through the shell.
# NOTE(review): xgboost is not imported anywhere in the visible cells — confirm this install is still needed.
!pip install xgboost

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import IsolationForest, RandomForestClassifier
from sklearn.feature_selection import mutual_info_classif
import copy
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.metrics import mean_squared_error as mse
from utility import *
from sklearn.model_selection import cross_val_score

## Importing the Data
1. Train data – `original` (working copy: `df`)
2. Test data – `testDf` (working copy: `test`)

In [139]:
# Read the two Titanic CSV files; `original` and `testDf` are kept as the
# untouched versions of the train and test sets.
original, testDf = (
    pd.read_csv(csv_path)
    for csv_path in ("./titanic/train.csv", "./titanic/test.csv")
)
# All later preprocessing works on independent deep copies.
df = original.copy(deep=True)
test = testDf.copy(deep=True)

## Exploratory Data Analysis
Types of Data:
1. Continuous Data
   1. Age
   2. Fare
2. Discrete Data
   1. Survived
   2. SibSp
   3. Parch
   4. PassengerId
3. Categorical Data
   1. Name
   2. Pclass
   3. Cabin
   4. Embarked
   5. Sex
   6. Ticket

![image.png](attachment:330bac07-c0a3-4fb3-a616-b64a7d6617eb.png)
![image.png](attachment:35db6205-e342-429d-8b0e-53b2c388c43c.png)
![image.png](attachment:c8eb2c98-3a62-4045-b958-318db589c437.png)

## Handling Missing Values
1. Cabin
2. Embarked
3. Age

In [140]:
# Missing-value handling for the train frame (df) and the test frame (test).
# process_data, create_splits, train_model and impute_age come from the local `utility`
# module; the description below is the author's and should be confirmed against utility.py:
#   * drop columns with >= 50% missing values, and drop the rows behind columns with <= 5% missing values
#   * drop the highly variable columns Ticket, Name, PassengerId
#   * factorize the remaining categorical columns (LabelEncoder)
# The single missing Fare in the test set is filled with a hard-coded constant instead of a fitted model.
# NOTE(review): the author describes 7.812 as the mode — confirm where this value comes from.
test.loc[test['Fare'].isna(), 'Fare'] = 7.812
df = process_data(df)
# Age is imputed with a regression model (linear regression, per the author) fitted on the training frame.
X_train, Y_train, X_test = create_splits(df.drop('Survived', axis = 1), "Age")
# Survived is left out of the Age-model features because the same model is reused on the
# test frame below, which is split without a Survived column.
model = train_model(X_train, Y_train)
df = impute_age(df, "Age", X_test, model)
test = process_data(test)
# Only X_test from this split is used: the model fitted on the training frame is applied
# to the test rows, it is not refitted on test data.
X_train, Y_train, X_test = create_splits(test, "Age")
test = impute_age(test, "Age", X_test, model)


## Handling Duplicates

In [141]:
# Presumably drops duplicated rows (helper from `utility`).
# NOTE(review): whether the index is renumbered afterwards depends on remove_duplicates — confirm.
df = remove_duplicates(df)

## Outlier Detection
1. Univariate Outliers
2. Multivariate Outliers

In [142]:
# SibSp and Parch are discrete counts, so extreme values can be capped (or removed)
# at a chosen level. The author caps by percentile, picked from domain knowledge,
# because the columns are heavily skewed and the IQR rule is judged unsuitable here.
cols = ['SibSp', "Parch"]
for count_col in cols:
    # Frequency table of each count column.
    print(f'{count_col} Value Count: ', df[count_col].value_counts())
# Box plots and distribution plots of both columns (helpers from `utility`).
box_plot(df, cols)
dist_plot(df, cols)

In [143]:
# Second look at the discrete SibSp / Parch columns: frequency tables followed by
# box and distribution plots. Percentile capping is preferred over the IQR rule
# because of the strong skew (author's domain judgement).
# NOTE(review): this cell runs the same statements as the cell before it.
cols = ['SibSp', "Parch"]
sibsp_counts, parch_counts = (df[name].value_counts() for name in cols)
print('SibSp Value Count: ', sibsp_counts)
print('Parch Value Count: ', parch_counts)
box_plot(df, cols)
dist_plot(df, cols)

In [144]:
# Cap SibSp: every value above the 97.5th percentile is replaced by 4 (author's choice).
upper_limit = np.percentile(df['SibSp'], 97.5)
print("Upper Limit for SibSp: ", upper_limit)
# A boolean mask is used instead of np.where(...)[0]: np.where returns *positional*
# row numbers, while .loc selects by index *label*. The two only agree when the index
# is a fresh 0..n-1 range, which is not guaranteed after rows were dropped upstream.
valuesToBeCapped = df["SibSp"] > upper_limit
df.loc[valuesToBeCapped, "SibSp"] = 4

In [145]:
# Cap Parch the same way as SibSp: values above the 98.5th percentile are replaced by 4,
# the level the author settled on after inspecting the counts and plots.
upper_limit = np.percentile(df['Parch'], 98.5)
print("Upper Limit for Parch: ", upper_limit)
# Boolean mask rather than np.where(...)[0]: np.where yields positional row numbers,
# but .loc is label-based, so the positions would hit the wrong rows (or raise)
# whenever the index is not a clean 0..n-1 range.
valuesToBeCapped = df["Parch"] > upper_limit
df.loc[valuesToBeCapped, "Parch"] = 4


### Continuous Data

In [146]:
# Fare is skewed, so a per-person fare is derived and log-transformed (transform_log,
# from `utility`) before the IQR rule is applied to it.
fare_pp = 'Fare Per Person'
log_fare_pp = 'Log_transformed_Fare Per Person'
df[fare_pp] = df["Fare"] / (df["SibSp"] + df['Parch'] + 1)
df = transform_log(df, fare_pp)

# Visual check of the transformed column.
col = [log_fare_pp]
box_plot(df, col)
dist_plot(df, col)

# IQR limits on the log column; only the upper limit is used for filtering.
col = log_fare_pp
lower_limit, upper_limit = calc_iqr(df, col)

# Keep rows below the upper IQR limit whose raw Fare is at least 4
# (4 is the lower bound the author chose after analysing the data).
keep_rows = (df[col] < upper_limit) & (df['Fare'] >= 4)
data = df.loc[keep_rows].reset_index(drop=True)
data.head()

In [147]:
# Univariate outlier handling for Age: ages above 65 are replaced by mean + 3 * std.
col = ['Age']
box_plot(data, col)
stanD = data['Age'].std()
meanD = data['Age'].mean()
cappedValue = meanD + 3*stanD
# The IQR limit is printed for reference only; the threshold actually applied is 65.
lower_limit, upper_limit = calc_iqr(data, "Age")
print("Upper Limit for Age:", upper_limit)
# Boolean mask on the Age Series instead of np.where(data[col] > 65)[0]: with `col` a
# list, data[col] is a DataFrame and np.where returns positional row numbers, which
# only match the labels .loc expects while the index is a clean 0..n-1 range.
# NOTE(review): if cappedValue is larger than 65, ages between 65 and cappedValue are
# raised rather than capped — confirm this is intended.
valuesToBeCapped = data["Age"] > 65
data.loc[valuesToBeCapped, "Age"] = cappedValue

## We are done with univariate outliers; now it's time for multivariate outliers.
I use Isolation Forest, which only accepts numerical data, so the categorical columns have to be encoded first. I have not used other techniques such as Mahalanobis distance because they assume normally distributed data.

In [148]:
# One-hot encode the port of embarkation and the gender on an independent copy of
# `data`, so the frame can be fed to a purely numerical model.
new_data = pd.get_dummies(data.copy(deep=True), columns=["Embarked", "Sex"])
new_data.head()

In [149]:
# Turn boolean dummies into 0/1 and recode Pclass (3 -> 1 and 1 -> 3; presumably so
# that a larger number means a higher class — confirm).
new_data = (
    new_data
    .replace({True: 1, False: 0})
    .assign(Pclass=lambda frame: frame['Pclass'].replace({3: 1, 1: 3}))
)
# The log-transformed helper columns should not be part of the anomaly model input.
cols = list(filter(lambda name: "Log" in name, new_data.columns))
print(cols)
new_data = new_data.drop(columns=cols)
new_data.head()

In [150]:
# Multivariate outlier detection: fit an Isolation Forest on a snapshot of the numeric
# frame, then attach its decision score and its prediction label to every row.
anomaly_data = new_data.copy(deep=True)
model = IsolationForest(
    n_estimators=100,
    contamination=0.015,
    random_state=42,
).fit(anomaly_data)
new_data = new_data.assign(**{
    'Anomaly Score': model.decision_function(anomaly_data),
    "Anomaly": model.predict(anomaly_data),
})
new_data.head()

In [151]:
# Drop the rows the Isolation Forest labelled -1 (its outlier label) rather than trying
# to adjust multivariate outliers, then renumber the index.
new_data = new_data[new_data["Anomaly"] != -1]
new_data.reset_index(drop = True, inplace= True)
# Undo the one-hot encoding of Sex and Embarked, which is not needed any more.
# NOTE(review): the return value of reverse_one_hot (from `utility`) is discarded, so it
# presumably modifies new_data in place — confirm against utility.py.
cols = ["Sex", "Embarked"]
reverse_one_hot(new_data, cols)
new_data.head()

In [152]:
# The two helper columns written by the anomaly model are no longer needed.
cols = ['Anomaly Score', "Anomaly"]
new_data = new_data.drop(columns=cols)
new_data.head()

## Feature Analysis and Feature Selection
I am going to use partial correlation, full (pairwise) correlation, and mutual information to find the most effective features for my dataset.


In [153]:
# Re-create the log-transformed per-person fare (the earlier log column was dropped
# before the anomaly model was fitted).
new_data = transform_log(new_data, "Fare Per Person")

# Run the feature analysis on an independent copy so new_data itself stays untouched.
analyzed_data = new_data.copy(deep=True)

# Partial-correlation analysis against the target (helper from `utility`).
pcCorr = compute_partial_relation(analyzed_data, "Survived")
print(analyzed_data.dtypes)

In [154]:
# Mutual information between each feature and Survived.
# The continuous columns below are binned into 10 equal-width ordinal bins first,
# because the MI estimate is more reliable on discrete data.
cols = ["Fare", "Log_transformed_Fare Per Person", "Age"]
discretizer = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='uniform')
analyzed_data[cols] = discretizer.fit_transform(analyzed_data[cols])

mi_features = analyzed_data.drop("Survived", axis = 1)
# Tell the estimator which columns are discrete. With the default
# discrete_features='auto' a dense matrix is treated as fully continuous, which would
# discard the binning done above. "Fare Per Person" is the one column that is still
# present but was not binned, so it stays continuous.
continuous_cols = ["Fare Per Person"]
discrete_mask = ~mi_features.columns.isin(continuous_cols)
mi_scores = mutual_info_classif(
    mi_features,
    analyzed_data["Survived"],
    discrete_features=discrete_mask,
    random_state=42,
)
mi_df = pd.DataFrame({'Feature': mi_features.columns, 'MI Score': mi_scores})
print(mi_df)

In [155]:
# Pairwise correlation of all columns, drawn as an annotated heatmap centred on 0.
corrFull = new_data.corr()
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    corrFull,
    annot=True,
    cmap="coolwarm",
    center=0,
    linewidths=0.5,
    ax=ax,
)
ax.set_title("Correlation Matrix")
plt.show()

### Notes
I have removed `Fare Per Person` and `SibSp` for better results, and added the log-transformed fare per person along with `Total People`, which accounts for `SibSp`.
`SibSp` is removed because its p-value did not support keeping it, and `Fare Per Person` is removed because I think its log-transformed version will perform better: its results were more statistically significant.

In [156]:
# Family size (passenger included) for the training frame.
new_data['Total People'] = new_data["Parch"] + new_data["SibSp"] + 1

# Same feature for the test frame, plus the per-person fare and its log transform.
test_party_size = test['Parch'] + test['SibSp'] + 1
test['Total People'] = test_party_size
test['Fare Per Person'] = test["Fare"] / test_party_size
test = transform_log(test, 'Fare Per Person')

# The raw per-person fare and SibSp are not used as model features.
new_data = new_data.drop(columns=['Fare Per Person', 'SibSp'])

In [157]:
# Preview the training frame after the feature changes made in the previous cell.
new_data.head()

## Final Touches

In [158]:
# Final encoding of the training frame: one-hot encode Sex and Embarked, convert the
# boolean dummies to 0/1, and order the columns alphabetically.
cols = ["Sex", "Embarked"]
encoded_data = pd.get_dummies(data=new_data, columns=cols).replace({True: 1, False: 0})
encoded_data = encoded_data.loc[:, sorted(encoded_data.columns)]
encoded_data.head()

In [159]:
# Number of columns in X_train.
# NOTE(review): in top-to-bottom order X_train was last assigned by create_splits in the
# missing-value cell; the model matrix is only built in the next cell.
X_train.columns.size

In [160]:
# Feature matrix and target for the final classifier.
X_train = encoded_data.drop("Survived", axis = 1)
Y_train = encoded_data['Survived']
# random_state fixes the forest's bootstrap and feature sampling so that the fitted
# model, the cross-validation score and the resulting submission are reproducible from
# run to run (same seed as the other estimators in this notebook).
model = RandomForestClassifier(n_jobs = -1,  max_depth = 4, n_estimators=50, random_state=42)
model.fit(X_train, Y_train)
# Predictions on the training rows themselves (optimistic; not used for scoring).
y_train = model.predict(X_train)
# 5-fold cross-validated accuracy; cross_val_score refits clones of `model` per fold.
cv_scores = cross_val_score(model, X_train, Y_train, cv=5, scoring='accuracy')

In [161]:
# Display the training feature matrix.
X_train

In [162]:
# Report the mean accuracy over the cross-validation folds.
mean_cv_accuracy = cv_scores.mean()
print("Cross-Validation Accuracy:", mean_cv_accuracy)
print("NICE")

In [163]:
# Bring the test frame into the same layout as the training matrix.
# -inf values (presumably from the log transform of a zero fare — confirm) become 0.
test = test.replace({-np.inf:0})
# Same feature removal as on the training frame.
test = test.drop(columns=['Fare Per Person', 'SibSp'])
# Same Pclass recoding as on the training frame (3 -> 1, 1 -> 3).
# NOTE(review): running this cell twice swaps Pclass back — run it once per session.
test['Pclass'] = test['Pclass'].replace({3: 1, 1:3})
cols = ["Sex", "Embarked"]
encoded_data_test = pd.get_dummies(data = test, columns = cols)
# Convert boolean dummies to 0/1 *after* get_dummies, as the training cell does;
# applied before the encoding, the conversion never reaches the dummy columns.
encoded_data_test = encoded_data_test.replace({True: 1, False: 0})
# Alphabetical column order, matching the ordering used for the training matrix.
X_test = encoded_data_test[sorted(encoded_data_test)]
X_test.head()

In [164]:
# Predict the target for every row of the prepared test matrix with the fitted model.
y_test = model.predict(X_test)


In [165]:
# Dump the raw predictions as integers ('%d') to a CSV file; no header and no
# PassengerId column are written.
np.savetxt('arrayFinal1.csv', y_test, delimiter=',', fmt='%d')


In [166]:
# Wrap the prediction array in a DataFrame so it can be assigned to the submission frame.
y_test = pd.DataFrame(y_test)

In [167]:
# Load the sample submission file; it serves as the template whose Survived column is
# overwritten in the next cell.
testResult = pd.read_csv('./titanic/gender_submission.csv')

In [168]:
# Replace the template's Survived values with the model predictions.
# NOTE(review): assumes the template rows are in the same order as test.csv — confirm.
testResult['Survived'] = y_test

In [169]:
# Preview the first ten rows of the submission frame.
testResult.head(10)

In [170]:
# Write the submission file: index=False leaves the row index out, header=1 (truthy)
# writes the column names.
testResult.to_csv('Submission_file_nameRandom2.csv', index=False , header = 1)
#