In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier 
from sklearn.model_selection import GridSearchCV 
from sklearn.pipeline import Pipeline 
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler 
from google.colab import drive 
# Mount Google Drive at /content/gDrive so the CSV files read from that path are reachable (needs a Colab runtime).
drive.mount('/content/gDrive')


In [None]:
# Load the train and test splits from the mounted Drive folder
train_path = '/content/gDrive/MyDrive/Colab Notebooks/train.csv'
test_path = '/content/gDrive/MyDrive/Colab Notebooks/test.csv'
train_df, test_df = pd.read_csv(train_path), pd.read_csv(test_path)

In [None]:
# Preview the first 15 rows of the training set to confirm it loaded as expected
train_df.head(15)

In [None]:
# Preview the first 15 rows of the test set to confirm it loaded as expected
test_df.head(15)

In [None]:
# Summary statistics of the training set
train_df.describe()

In [None]:
# Summary statistics of the test set
test_df.describe()

In [None]:
# Count how many training rows fall under each value of 'Survived'
train_df['Survived'].value_counts()

In [None]:
# Column dtypes and non-null counts of the training set, to see which columns have missing values
train_df.info()

In [None]:
# Column dtypes and non-null counts of the test set, to see which columns have missing values
test_df.info()

In [None]:
# Replace missing Cabin entries with the placeholder 'NA' in both splits
for frame in (train_df, test_df):
    frame['Cabin'] = frame['Cabin'].fillna('NA')

In [None]:
# Re-check the training set's non-null counts after filling Cabin
train_df.info()

In [None]:
# Bar chart: number of training passengers per sex
sex_ax = sns.countplot(x='Sex', data=train_df)
sex_ax.set(title='Sex')
plt.show()

In [None]:
# Bar chart: overall survival counts (0 = did not survive, 1 = survived)
survival_ax = sns.countplot(x='Survived', data=train_df)
survival_ax.set(title='Survival')
plt.show()

In [None]:
# Bar chart: number of training passengers per Pclass value
pclass_ax = sns.countplot(x='Pclass', data=train_df)
pclass_ax.set(title='Pclass')
plt.show()

In [None]:
# Bar chart: passengers per sex, split by whether they survived
sex_survival_ax = sns.countplot(x='Sex', hue='Survived', data=train_df)
sex_survival_ax.set(title='Sex and Survival')
plt.show()

In [None]:
# Bar chart: passengers per Pclass, split by sex
# (this plot shows the Pclass/Sex breakdown, not survival)
pclass_sex_ax = sns.countplot(x='Pclass', hue='Sex', data=train_df)
pclass_sex_ax.set(title='PClass and Sex')
plt.show()

In [None]:
# Box plot: distribution of Age within each Survived group
age_ax = sns.boxplot(x='Survived', y='Age', data=train_df)
age_ax.set(title='Survived and Age')
plt.show()

In [None]:
# Prepare the dataset
# Reload the raw training CSV so modelling starts from the untouched file
# (this discards the 'NA' Cabin fill applied to train_df during exploration).
train_df = pd.read_csv('/content/gDrive/MyDrive/Colab Notebooks/train.csv')
X = train_df['Sex'] # input feature column
# The target must come from the same frame as X so the rows line up;
# 'Survived' is the label the classifier is meant to predict.
y = train_df['Survived'] # output (target) column

# Correlate numeric columns only: pandas >= 2.0 raises on non-numeric
# columns instead of silently dropping them.
sns.heatmap(train_df.corr(numeric_only=True), cmap="YlGnBu")
plt.show()

In [None]:
# Fills the missing values of age
class AgeImputer(BaseEstimator, TransformerMixin):
  """Fill missing values in the 'Age' column with the mean age.

  The mean is learned in ``fit`` and reused in ``transform``, so data
  transformed later (e.g. a test split) is filled with the mean of the data
  the imputer was fitted on rather than its own mean.
  """

  def fit(self, X, y = None):
    """Learn the mean of ``X['Age']``.

    Args:
      X: DataFrame containing an 'Age' column.
      y: Ignored; accepted for scikit-learn pipeline compatibility.

    Returns:
      self
    """
    self.imputer_ = SimpleImputer(strategy='mean').fit(X[['Age']])
    return self

  def transform(self, X):
    """Return a copy of ``X`` with missing 'Age' values filled.

    Args:
      X: DataFrame containing an 'Age' column.

    Returns:
      A new DataFrame; the caller's frame is left unmodified.
    """
    imputer = getattr(self, 'imputer_', None)
    if imputer is None:
      # Backward compatibility: transform() used to work without a prior
      # fit(), imputing with the mean of the frame it was given.
      imputer = SimpleImputer(strategy='mean').fit(X[['Age']])
    filled = X.copy()  # do not mutate the caller's DataFrame
    # The imputer returns a 2-D (n, 1) result; flatten it for a single column.
    filled['Age'] = np.asarray(imputer.transform(filled[['Age']])).ravel()
    return filled

In [None]:
# # Create a OneHotEncoder that creates columns that are assigned binary values of 1 or 0.
# # This is so the pipeline can read everything
# class FeatureEncoder(BaseEstimator, TransformerMixin):
#   def fit(self, X, y = None):
#     return self

# # Create encoder and matrix 
#   def transform(self, X):
#     encoder = OneHotEncoder()
#     matrix = encoder.fit_transform(X[['Embarked']]).toarray()

#     # Change values into binary columns
#     column_name = ['C', 'S', 'Q', 'H'] 

#     for i in range(len(matrix.T)):
#       X[column_name[i]] = matrix.T[i]
    
#     matrix = encoder.fit_transform(X[['Sex']]).toarray()

#     column_name = ['Female', 'Male']

#     for i in range(len(matrix.T)):
#       X[column_name[i]] = matrix.T[i]

#       return X 

# class FeatureDropper(BaseEstimator, TransformerMixin):
#   def fit(self, X, y = None):
#     return self

#   def transform(self, X):
#     return X.drop(['Name', 'Ticket', 'Cabin', 'Sex'], axis = 1, errors = 'ignore')


In [None]:
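# This is the actual preprocessing pipeline (currently disabled: it is commented out, as are the FeatureEncoder and FeatureDropper steps it needs)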
# pipeline = Pipeline([('ageImputer', AgeImputer()), 
#                      ('featureEncoder', FeatureEncoder()), 
#                      ('featureDropper', FeatureDropper()) ])

In [None]:
# train_df = pipeline.fit_transform(train_df)

In [None]:
# scaler = StandardScaler()
# X_data = scaler.fit_transform(X)
# y_data = y.to_numpy()

In [None]:
# Create a random forest classifier and a grid of hyper-parameters to search through.
# NOTE: the search is only configured here and never fitted, so no predictions
# or accuracy score are produced yet (this part of the notebook is unfinished).
# A fixed random_state makes the forest's randomness repeatable across runs.
classifier = RandomForestClassifier(random_state=42)

parameter_grid = [
    {'n_estimators': [10, 100, 200, 300], 'max_depth': [None, 5, 10], 'min_samples_split': [2, 3, 4]}
]

# Search the grid with 3-fold cross-validation, scoring candidates by accuracy
grid_search = GridSearchCV(classifier, parameter_grid, cv=3, scoring='accuracy', return_train_score = True)
grid_search