In [1]:
# NumPy
import numpy as np # type: ignore

# Dataframe operations
import pandas as pd # type: ignore

# Data visualization
import seaborn as sns # type: ignore
import matplotlib.pyplot as plt # type: ignore

# Scalers
from sklearn.preprocessing import MinMaxScaler # type: ignore
from sklearn.preprocessing import StandardScaler # type: ignore
from sklearn.utils import shuffle # type: ignore

# Models
from sklearn.linear_model import LogisticRegression #logistic regression # type: ignore
from sklearn.linear_model import Perceptron # type: ignore
from sklearn import svm #support vector Machine # type: ignore
from sklearn.ensemble import RandomForestClassifier #Random Forest # type: ignore
from sklearn.neighbors import KNeighborsClassifier #KNN # type: ignore
from sklearn.naive_bayes import GaussianNB #Naive bayes # type: ignore
from sklearn.tree import DecisionTreeClassifier #Decision Tree # type: ignore
from sklearn.model_selection import train_test_split #training and testing data split# type: ignore
from sklearn import metrics #accuracy measure # type: ignore
from sklearn.metrics import confusion_matrix #for confusion matrix # type: ignore
from sklearn.ensemble import VotingClassifier # type: ignore
from sklearn.ensemble import AdaBoostClassifier # type: ignore
from sklearn.neural_network import MLPClassifier # type: ignore

# Cross-validation
from sklearn.model_selection import KFold #for K-fold cross validation # type: ignore
from sklearn.model_selection import cross_val_score #score evaluation # type: ignore
from sklearn.model_selection import cross_val_predict #prediction # type: ignore
from sklearn.model_selection import cross_validate # type: ignore

# GridSearchCV
from sklearn.model_selection import GridSearchCV # type: ignore

#Common Model Algorithms
from sklearn import svm, tree, linear_model, neighbors, naive_bayes, ensemble, discriminant_analysis, gaussian_process # type: ignore

#Common Model Helpers
from sklearn.preprocessing import OneHotEncoder, LabelEncoder # type: ignore
from sklearn import feature_selection # type: ignore
from sklearn import model_selection # type: ignore
from sklearn import metrics # type: ignore

#Visualization
import matplotlib as mpl # type: ignore
import matplotlib.pyplot as plt # type: ignore
import matplotlib.pylab as pylab # type: ignore
import seaborn as sns # type: ignore
from pandas.plotting import scatter_matrix # type: ignore

from sklearn.preprocessing import LabelEncoder # type: ignore
from sklearn.preprocessing import MinMaxScaler # type: ignore
from sklearn.linear_model import LogisticRegression # type: ignore
from sklearn.metrics import accuracy_score # type: ignore
from sklearn.model_selection import train_test_split # type: ignore

from sklearn.impute import SimpleImputer # type: ignore
from sklearn.svm import SVC # type: ignore
import os # type: ignore

ModuleNotFoundError: No module named 'numpy'

In [None]:
train_df = pd.read_csv("../data/train.csv")
test_df = pd.read_csv("../data/test.csv")
data_df = pd.concat([train_df, test_df], ignore_index=True)
data_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  1309 non-null   int64  
 1   Survived     891 non-null    float64
 2   Pclass       1309 non-null   int64  
 3   Name         1309 non-null   object 
 4   Sex          1309 non-null   object 
 5   Age          1046 non-null   float64
 6   SibSp        1309 non-null   int64  
 7   Parch        1309 non-null   int64  
 8   Ticket       1309 non-null   object 
 9   Fare         1308 non-null   float64
 10  Cabin        295 non-null    object 
 11  Embarked     1307 non-null   object 
dtypes: float64(3), int64(4), object(5)
memory usage: 122.8+ KB


In [None]:
import pandas as pd

# Assuming data_df is already defined and loaded

# Extract Title from Name
data_df['Title'] = data_df['Name'].str.extract('([A-Za-z]+)\.', expand=True)

# Map rare titles to more common ones
mapping = {
    'Mlle': 'Miss', 'Mme': 'Mrs', 'Ms': 'Miss', 'Dr': 'Dr',
    'Major': 'Mr', 'Lady': 'Mrs', 'Sir': 'Mr', 'Col': 'Mr',
    'Capt': 'Mr', 'Countess': 'Mrs', 'Jonkheer': 'Mr',
    'Dona': 'Mrs', 'Don': 'Mr', 'Rev': 'Rev', 'Master': 'Master',
    'Miss': 'Miss', 'Mr': 'Mr', 'Mrs': 'Mrs'
}
data_df['Title'] = data_df['Title'].map(mapping)

# Fill missing Age based on Title median
age_by_title = data_df.groupby('Title')['Age'].median()
data_df['Age'] = data_df.apply(lambda row: age_by_title[row['Title']] if pd.isnull(row['Age']) else row['Age'], axis=1)

# Create copies of the slices to avoid SettingWithCopyWarning
train_df = data_df.loc[:890].copy()
test_df = data_df.loc[891:].copy()

# Calculate Family_Size and re-split data_df using .loc to avoid SettingWithCopyWarning
data_df['Family_Size'] = data_df['Parch'] + data_df['SibSp']
train_df['Family_Size'] = train_df['Parch'] + train_df['SibSp']
test_df['Family_Size'] = test_df['Parch'] + test_df['SibSp']

data_df.drop('Title', axis=1, inplace=True)
train_df.drop('Title', axis=1, inplace=True)
test_df.drop('Title', axis=1, inplace=True)

# Check for missing values in the dataset
missing_values = data_df.isnull().sum()
print(missing_values)


PassengerId       0
Survived        418
Pclass            0
Name              0
Sex               0
Age               0
SibSp             0
Parch             0
Ticket            0
Fare              1
Cabin          1014
Embarked          2
Family_Size       0
dtype: int64


In [None]:
data_df['Last_Name'] = data_df['Name'].apply(lambda x: str.split(x, ",")[0])
data_df['Fare'] = data_df['Fare'].fillna(data_df['Fare'].median())

DEFAULT_SURVIVAL_VALUE = 0.25
data_df['Family_Survival'] = DEFAULT_SURVIVAL_VALUE

for grp, grp_df in data_df[['Survived','Name', 'Last_Name', 'Fare', 'Ticket', 'PassengerId',
                           'SibSp', 'Parch', 'Age', 'Cabin']].groupby(['Last_Name', 'Fare']):
    
    if (len(grp_df) != 1):
        # A Family group is found.
        for ind, row in grp_df.iterrows():
            smax = grp_df.drop(ind)['Survived'].max()
            smin = grp_df.drop(ind)['Survived'].min()
            passID = row['PassengerId']
            if (smax == 1):
                data_df.loc[data_df['PassengerId'] == passID, 'Family_Survival'] = 1
            elif (smin==0):
                data_df.loc[data_df['PassengerId'] == passID, 'Family_Survival'] = 0

tolerance = 1e-10
filtered_data_df = data_df[abs(data_df['Family_Survival'] - 0.25) > tolerance]
print("Number of passengers with family survival information:", filtered_data_df.shape[0])


Number of passengers with family survival information: 420


In [None]:
for _, grp_df in data_df.groupby('Ticket'):
    if (len(grp_df) != 1):
        for ind, row in grp_df.iterrows():
            if (row['Family_Survival'] == 0) | (np.isclose(row['Family_Survival'], 0.25, rtol=1e-09, atol=1e-09)):
                smax = grp_df.drop(ind)['Survived'].max()
                smin = grp_df.drop(ind)['Survived'].min()
                passID = row['PassengerId']
                if (smax == 1):
                    data_df.loc[data_df['PassengerId'] == passID, 'Family_Survival'] = 1
                elif (smin==0):
                    data_df.loc[data_df['PassengerId'] == passID, 'Family_Survival'] = 0
                        
# Correcting the line to avoid the ValueError
print("Number of passengers with family/group survival information:", 
      data_df.loc[~np.isclose(data_df['Family_Survival'], 0.25, rtol=1e-09, atol=1e-09)].shape[0])

# Assuming data_df is the combined DataFrame of TRAIN_DF and TEST_DF before the split

# When initially splitting data_df into train_df and test_df, use .copy()
train_df = data_df.iloc[:891].copy()
test_df = data_df.iloc[891:].copy()

# After ensuring train_df and test_df are copies, you can then safely use .loc to modify these DataFrames
train_indices = train_df.index
test_indices = test_df.index

train_df.loc[train_indices, 'Family_Survival'] = data_df.loc[train_indices, 'Family_Survival']
test_df.loc[test_indices, 'Family_Survival'] = data_df.loc[test_indices, 'Family_Survival']



Number of passengers with family/group survival information: 546


In [None]:
# Use the assignment directly instead of inplace=True
data_df['Fare'] = data_df['Fare'].fillna(data_df['Fare'].median())

# Making Bins
data_df['FareBin'] = pd.qcut(data_df['Fare'], 5)

label = LabelEncoder()
data_df['FareBin_Code'] = label.fit_transform(data_df['FareBin'])

# Create copies of the slices to avoid SettingWithCopyWarning
train_df = data_df.loc[:890].copy()
test_df = data_df.loc[891:].copy()

# Assign the 'FareBin_Code' values back to train_df and test_df
train_df['FareBin_Code'] = train_df['FareBin_Code']
test_df['FareBin_Code'] = test_df['FareBin_Code']

# Drop the original 'Fare' column as it's now represented by 'FareBin_Code'
train_df.drop(columns=['Fare'], inplace=True)
test_df.drop(columns=['Fare'], inplace=True)
# Drop the 'FareBin' column as it's now represented by 'FareBin_Code'
train_df.drop(columns=['FareBin'], inplace=True)
test_df.drop(columns=['FareBin'], inplace=True)

# Display the first few rows of the updated training DataFrame to verify the changes
print(train_df.head())


   PassengerId  Survived  Pclass  \
0            1       0.0       3   
1            2       1.0       1   
2            3       1.0       3   
3            4       1.0       1   
4            5       0.0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket Cabin Embarked  Family_Size  Last_Name  \
0      0         A/5 21171   NaN        S            1     Braund   
1      0          PC 17599   C85        C            1    Cumings   
2      0  STON/O2. 3101282   NaN        S            0  Heikkinen   
3      0            113803  C123        S   

In [None]:
data_df['AgeBin'] = pd.qcut(data_df['Age'], 4)

label = LabelEncoder()
data_df['AgeBin_Code'] = label.fit_transform(data_df['AgeBin'])

train_df['AgeBin_Code'] = data_df['AgeBin_Code'][:891]
test_df['AgeBin_Code'] = data_df['AgeBin_Code'][891:]

train_df.drop(columns=['Age'], inplace=True)
test_df.drop(columns=['Age'], inplace=True)

In [None]:
test_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,SibSp,Parch,Ticket,Cabin,Embarked,Family_Size,Last_Name,Family_Survival,FareBin_Code,AgeBin_Code
891,892,,3,"Kelly, Mr. James",male,0,0,330911,,Q,0,Kelly,0.25,0,2
892,893,,3,"Wilkes, Mrs. James (Ellen Needs)",female,1,0,363272,,S,1,Wilkes,0.25,0,3
893,894,,2,"Myles, Mr. Thomas Francis",male,0,0,240276,,Q,0,Myles,0.25,1,3
894,895,,3,"Wirz, Mr. Albert",male,0,0,315154,,S,0,Wirz,0.25,1,1
895,896,,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,1,1,3101298,,S,2,Hirvonen,1.0,2,0


In [None]:
import pandas as pd

# Assuming train_df, test_df, and data_df have been previously defined

# Replace 'male' and 'female' with 1 and 0, respectively
# Use the replace method and explicitly handle the downcasting behavior
train_df['Sex'] = train_df['Sex'].replace(['male', 'female'], [0, 1]).astype(int)

test_df['Sex'] = test_df['Sex'].replace(['male', 'female'], [0, 1]).astype(int)

train_df.dropna(subset=['Sex'], inplace=True)
test_df.dropna(subset=['Sex'], inplace=True)

# Drop unnecessary columns from both datasets
columns_to_drop = ['Name', 'SibSp', 'Parch', 'Ticket', 'Cabin', 'Embarked', 'Last_Name']
train_df.drop(columns=columns_to_drop, inplace=True)
test_df.drop(columns=columns_to_drop, inplace=True)
data_df.drop(columns=columns_to_drop, inplace=True)

# Display the first few rows of the training dataframe
print(train_df.head(3))
print(test_df.head(3))


   PassengerId  Survived  Pclass  Sex  Family_Size  Family_Survival  \
0            1       0.0       3    0            1             0.25   
1            2       1.0       1    1            1             0.25   
2            3       1.0       3    1            0             0.25   

   FareBin_Code  AgeBin_Code  
0             0            0  
1             4            3  
2             1            1  
     PassengerId  Survived  Pclass  Sex  Family_Size  Family_Survival  \
891          892       NaN       3    0            0             0.25   
892          893       NaN       3    1            1             0.25   
893          894       NaN       2    0            0             0.25   

     FareBin_Code  AgeBin_Code  
891             0            2  
892             0            3  
893             1            3  


  train_df['Sex'] = train_df['Sex'].replace(['male', 'female'], [0, 1]).astype(int)
  test_df['Sex'] = test_df['Sex'].replace(['male', 'female'], [0, 1]).astype(int)


In [None]:
# drop survived from X
X = train_df.drop('Survived', axis=1)
# y has to have only 'PassengerId', 'Survived' columns
y = train_df['Survived'].astype(int)
X_test = test_df.copy().drop('Survived', axis=1)
print("X.shape", X.shape,"y.shape", y.shape, "X_test.shape", X_test.shape)

X.shape (891, 7) y.shape (891,) X_test.shape (418, 7)


In [None]:
print("X", X.head())
print("X_test", X_test.head())


X    PassengerId  Pclass  Sex  Family_Size  Family_Survival  FareBin_Code  \
0            1       3    0            1             0.25             0   
1            2       1    1            1             0.25             4   
2            3       3    1            0             0.25             1   
3            4       1    1            1             0.00             4   
4            5       3    0            0             0.25             1   

   AgeBin_Code  
0            0  
1            3  
2            1  
3            2  
4            2  
X_test      PassengerId  Pclass  Sex  Family_Size  Family_Survival  FareBin_Code  \
891          892       3    0            0             0.25             0   
892          893       3    1            1             0.25             0   
893          894       2    0            0             0.25             1   
894          895       3    0            0             0.25             1   
895          896       3    1            2           

In [None]:
# Initialize StandardScaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_test_scaled = scaler.transform(X_test)

# Defining models for the voting classifier
knn_models = [
    ('knn1', KNeighborsClassifier(n_neighbors=4, leaf_size=1, weights='uniform')),
    ('knn2', KNeighborsClassifier(algorithm='auto', leaf_size=26, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=18, p=2,
           weights='uniform')),
    ('knn3', KNeighborsClassifier(algorithm='auto', leaf_size=26, metric='minkowski', metric_params=None, n_jobs=1, n_neighbors=18, p=2, weights='uniform')),
]
random_forest = ('rf', RandomForestClassifier(n_estimators=100, random_state=42))
svc = ('svc', SVC(probability=True, kernel='linear'))

# Combining all models into a voting classifier
models = knn_models + [random_forest, svc]
voting_clf = VotingClassifier(estimators=models, voting='soft')

# Fit the voting classifier on scaled data
voting_clf.fit(X_scaled, y)

# Predict using the ensemble of models
y_pred = voting_clf.predict(X_test_scaled)

# Prepare the submission dataframe
submission = pd.DataFrame({
    'PassengerId': test_df['PassengerId'],
    'Survived': y_pred
})

# Ensure the output directory exists
output_dir = '../data'
os.makedirs(output_dir, exist_ok=True)

# Save the submission file
submission.to_csv(f'{output_dir}/gender_submission.csv', index=False)


In [None]:
n_neighbors = [6,7,8,9,10,11,12,14,16,18,20,22]
algorithm = ['auto']
weights = ['uniform', 'distance']
leaf_size = list(range(1,50,5))
hyperparams = {'algorithm': algorithm, 'weights': weights, 'leaf_size': leaf_size, 
               'n_neighbors': n_neighbors}
gd=GridSearchCV(estimator = KNeighborsClassifier(), param_grid = hyperparams, verbose=True, 
                cv=10, scoring = "roc_auc")
gd.fit(X, y)
print(gd.best_score_)
print(gd.best_estimator_)

Fitting 10 folds for each of 240 candidates, totalling 2400 fits
0.4429411764705883
KNeighborsClassifier(leaf_size=11, n_neighbors=7)
