
## Import Modules

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

## Loading the dataset

In [None]:
import os

# Collect the path of every file below the working directory, then print
# them one per line so the dataset files can be located.
file_paths = [
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('.')
    for name in names
]
for path in file_paths:
    print(path)

In [None]:
# Read the training and test CSV files from the working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))
# Preview the first rows of the training frame.
train.head()

In [None]:
# Summary statistics of the training frame.
train.describe()

In [None]:
# Print column dtypes and non-null counts of the training frame.
train.info()

## Exploratory Data Analysis

In [None]:
# Bar chart of the number of rows per 'Survived' value.
# The column is named through the explicit ``x=`` keyword: seaborn >= 0.12
# accepts only ``data`` positionally, so a Series passed positionally is no
# longer interpreted as the x variable.
sns.countplot(x='Survived', data=train)

In [None]:
# Total fare per passenger class, drawn as a bar chart.
# The aggregation is given by name: pandas >= 2.1 emits a FutureWarning when
# a NumPy callable such as np.sum is passed, and recommends the string form
# to keep the same result.
class_fare = train.pivot_table(index='Pclass', values='Fare', aggfunc='sum')
class_fare.plot(kind='bar')
plt.xlabel('Pclass')
plt.ylabel('Total Fare')
plt.xticks(rotation=0)  # keep the class labels horizontal
plt.show()

## Data Preprocessing

In [None]:
# Remember how many rows belong to the training set so the combined frame
# can be split apart again after preprocessing.
train_len = train.shape[0]

# Stack train on top of test and renumber the rows 0..n-1 in one step.
df = pd.concat([train, test], axis=0, ignore_index=True)
df.head()

In [None]:
# Inspect the last rows of the combined frame.
df.tail()

In [None]:
# Number of missing values in each column of the combined frame.
df.isnull().sum()

In [None]:
# Remove the 'Cabin' column from the combined frame.
df = df.drop(columns='Cabin')

# Mean of 'Age', shown for reference before it is used for imputation.
df['Age'].mean()

In [None]:
# Fill missing 'Age' and 'Fare' entries with the mean of the respective column.
# NOTE(review): df holds train and test rows together, so each mean is
# computed over both sets -- confirm this is intended.
for numeric_col in ('Age', 'Fare'):
    column_mean = df[numeric_col].mean()
    df[numeric_col] = df[numeric_col].fillna(column_mean)

# Re-check the remaining missing values per column.
df.isnull().sum()

In [None]:
# Most frequent value of 'Embarked'; mode() can return several values, so
# [0] picks the first one.
df['Embarked'].mode()[0]

In [None]:
# Fill missing 'Embarked' entries with the column's most frequent value.
embarked_mode = df['Embarked'].mode()[0]
df['Embarked'] = df['Embarked'].fillna(embarked_mode)

## Log transformation to reduce skew in the Fare distribution

In [None]:
# Plot the distribution of 'Fare' before the log transformation.
sns.displot(df['Fare'])

In [None]:
# Replace 'Fare' by log(1 + Fare); the +1 maps a fare of 0 to 0 instead of
# -inf. np.log1p evaluates log(1 + x) directly and is more accurate than
# np.log(x + 1) for small x.
df['Fare'] = np.log1p(df['Fare'])

# Optional check of the transformed distribution:
# sns.displot(df['Fare'])

In [None]:
# Remove the 'Name' and 'Ticket' columns from the combined frame.
df = df.drop(['Name', 'Ticket'], axis='columns')
df.head()

## Label Encoding

In [None]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
cols = ['Sex', 'Embarked']

# Replace each listed column by integer codes. The same encoder object is
# refitted on every column, so after the loop `le` only holds the mapping of
# the last column in `cols`.
for col in cols:
    encoded = le.fit_transform(df[col])
    df[col] = encoded
df.head()

## Split the Combined Data Back into Train and Test

In [None]:
# Undo the earlier concatenation: the first `train_len` rows are the
# training data, everything after them is the test data.
train, test = df.iloc[:train_len], df.iloc[train_len:]

# Target column, and the feature matrix without identifier and target.
y = train['Survived']
X = train.drop(['PassengerId', 'Survived'], axis=1)

X.head()

## Model Training

In [None]:
from sklearn.model_selection import train_test_split, cross_val_score

def classify(model):
    """Fit ``model`` and print a hold-out score and a cross-validation score.

    The module-level ``X`` and ``y`` are split 75/25 with a fixed seed (42).
    The model is fitted on the larger part and its ``score`` on the smaller
    part is printed as 'Accuracy'. Afterwards a 5-fold ``cross_val_score``
    over the full ``X`` and ``y`` is computed and its mean printed as
    'CV Score'.

    Args:
        model: Estimator object providing ``fit`` and ``score``.

    Returns:
        None. The results are only printed.
    """
    split = train_test_split(X, y, test_size=0.25, random_state=42)
    X_train, X_test, y_train, y_test = split

    model.fit(X_train, y_train)
    holdout_score = model.score(X_test, y_test)
    print('Accuracy:', holdout_score)

    fold_scores = cross_val_score(model, X, y, cv=5)
    print('CV Score:', np.mean(fold_scores))

In [None]:
# Evaluate a logistic regression with default hyperparameters via classify().
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
classify(model)

In [None]:
# Evaluate a decision tree with default hyperparameters via classify().
from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier()
classify(model)

In [None]:
# Evaluate a random forest with default hyperparameters via classify().
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier()
classify(model)

In [None]:
# Evaluate an extra-trees ensemble with default hyperparameters via classify().
from sklearn.ensemble import ExtraTreesClassifier
model = ExtraTreesClassifier()
classify(model)

In [None]:
# Evaluate an XGBoost classifier with default hyperparameters via classify().
from xgboost import XGBClassifier
model = XGBClassifier()
classify(model)

In [None]:
# Evaluate a LightGBM classifier with default hyperparameters via classify().
# LGBMClassifier is used again further down for the final model.
from lightgbm import LGBMClassifier
model = LGBMClassifier()
classify(model)

In [None]:
from catboost import CatBoostClassifier

# Evaluate a CatBoost classifier via classify(); verbose=0 is passed to the
# constructor to turn off its training log output.
model = CatBoostClassifier(verbose=0)
classify(model)

## Complete Model Training with Full Data

In [None]:
# Train the final LightGBM model on all labelled rows; fit() returns the
# fitted estimator itself, so it can be bound directly.
model = LGBMClassifier().fit(X, y)

# Preview the rows that will be predicted.
test.head()

In [None]:
# Feature matrix for the test rows: same columns as X, i.e. without the
# identifier and the target column.
X_test = test.drop(['PassengerId', 'Survived'], axis=1)

X_test.head()

In [None]:
# Predict a label for every test row with the final model and display the result.
pred = model.predict(X_test)
pred

## Test Submission

In [None]:
# List all files below the working directory again, one path per line.
for folder, _, entries in os.walk('.'):
    for entry in entries:
        full_path = os.path.join(folder, entry)
        print(full_path)

In [None]:
# Load 'gender_submission.csv'; its 'Survived' column is overwritten with
# the model predictions below.
sub = pd.read_csv('gender_submission.csv')
sub.head()

In [None]:
# Overwrite the 'Survived' column with the predictions and cast it to integers.
# NOTE(review): the assignment is positional, so it assumes the rows of
# `sub` are in the same order as the test rows -- confirm.
sub['Survived'] = pred
sub = sub.astype({'Survived': 'int'})

sub.info()

In [None]:
# Write the submission to 'submission.csv' without the DataFrame index column.
sub.to_csv('submission.csv', index=False)