# First Principles Thinking

* Step 1: Identify any assumptions that you may have (batteries are $600/KwH - they're too expensive)
* Step 2: Breakdown the issues (the fundamentals - what materials go into a battery? Can I find them for cheap?)
* Step 3: Create new solutions (use your fundamental analysis and knowledge to create a solution)

https://www.youtube.com/watch?v=NV3sBlRgzTI

```
bins=[0, 13, 19, 61, sys.maxsize]
labels=['<12', 'Teen', 'Adult', 'Older']
(
  pd.read_csv('data/train.csv')
    .pipe(replace_age_na, pclass_age_map)
    .query('Embarked == "S"')
    .assign(ageGroup = lambda df: pd.cut(df['Age'], bins=bins, labels=labels))
    .pivot_table(
        values='Survived', 
        columns='Pclass', 
        index='ageGroup', 
        aggfunc='mean')
    .rename_axis('', axis='columns')
    .rename('Class {}'.format, axis='columns')
    .style.format('{:.2%}')
)

```



In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
%cd '/content/drive/My Drive/Kaggle'

/content/drive/My Drive/Kaggle


Unnamed: 0,passenger_id,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest,survived
0,1216,3,"Smyth, Miss. Julia",female,,0,0,335432,7.7333,,Q,13.0,,,1
1,699,3,"Cacic, Mr. Luka",male,38.0,0,0,315089,8.6625,,S,,,Croatia,0
2,1267,3,"Van Impe, Mrs. Jean Baptiste (Rosalie Paula Go...",female,30.0,1,1,345773,24.15,,S,,,,0
3,449,2,"Hocking, Mrs. Elizabeth (Eliza Needs)",female,54.0,1,3,29105,23.0,,S,4.0,,"Cornwall / Akron, OH",1
4,576,2,"Veal, Mr. James",male,40.0,0,0,28221,13.0,,S,,,"Barre, Co Washington, VT",0


In [5]:
# EDA
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Preprocessing
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score

# Model
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

# Metrics
from sklearn.metrics import accuracy_score, precision_score, f1_score, recall_score
from sklearn.metrics import classification_report, roc_auc_score, roc_curve

df = pd.read_csv('titanic_train.csv')
df.dtypes

passenger_id      int64
pclass            int64
name             object
sex              object
age             float64
sibsp             int64
parch             int64
ticket           object
fare            float64
cabin            object
embarked         object
boat             object
body            float64
home.dest        object
survived          int64
dtype: object

In [6]:
df.isna().sum()

passenger_id      0
pclass            0
name              0
sex               0
age             174
sibsp             0
parch             0
ticket            0
fare              1
cabin           659
embarked          1
boat            542
body            777
home.dest       386
survived          0
dtype: int64

In [7]:
len(df)

850

In [8]:
df.drop(['cabin','boat','body','home.dest','passenger_id','name','ticket'], 
        axis=1,
        inplace=True)

df.isna().sum()

pclass        0
sex           0
age         174
sibsp         0
parch         0
fare          1
embarked      1
survived      0
dtype: int64

In [9]:
df.dtypes

pclass        int64
sex          object
age         float64
sibsp         int64
parch         int64
fare        float64
embarked     object
survived      int64
dtype: object

In [17]:
# Pipeline - SimpleImputer, OneHotEncoder, ColumnTransformer

# Categorical
categorical_columns = ['embarked'] # Fill with SimpleImputer -- Make 'S' the fill value
categorical_section = Pipeline(steps=[
    ('imputer',SimpleImputer(strategy='constant', fill_value='S')),
    ('onehot',OneHotEncoder(handle_unknown='ignore'))
])

# Average
average_value = ['age','fare'] # Fill with SimpleImputer -- Mean of all the age.
average_section = Pipeline(steps=[
    ('imputer',SimpleImputer(strategy='mean'))
])

# ColumnTransformer
preprocessor = ColumnTransformer(transformers=[
                     ('cat', categorical_section, categorical_columns),
                     ('avg', average_section, average_value) 
])

# Creating the model
model = Pipeline(steps=[
                        ('preprocessor', preprocessor),
                        ('model',RandomForestClassifier())
])

# Split the data
X = df.drop('survived', axis=1)
y = df.survived

# Seed and split into train & test
np.random.seed(42)

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2)

# Fit and Score
model.fit(X_train,y_train)
model.score(X_test,y_test)

0.6647058823529411