### Import Packages and Load Data

In [30]:
import numpy as np
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

In [2]:
from sklearn import preprocessing
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

In [3]:
import xgboost as xgb

In [4]:
titanic_training = pd.read_csv('/Users/jpmcelyea/GIT_Code/Data-Science/Kaggle-Titanic/titanic/train.csv')

In [5]:
titanic_test = pd.read_csv('/Users/jpmcelyea/GIT_Code/Data-Science/Kaggle-Titanic/titanic/test.csv')

### Initial Dataframe Exploration

In [6]:
titanic_training.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [7]:
titanic_training.set_index("PassengerId", inplace=True)

In [8]:
titanic_training[['Cabin1', 'Cabin2', 'Cabin3']] = titanic_training['Cabin'].\
                str.extract(r'(?P<Cabin1>[A-Z]+\d+)\s?(?P<Cabin2>[A-Z]+\d+)?\s?(?P<Cabin3>[A-Z]+\d+)?')

In [9]:
titanic_training.drop(columns=['Ticket', 'Name','Cabin'], inplace = True)

In [10]:
titanic_training['CabinLetter'] = titanic_training['Cabin1'].str.extract(r'(?P<Cabin1_Code>[A-Z]+)')

In [11]:
titanic_training['Cabin1'] = np.where(titanic_training['Cabin1'].isnull(), 0, 1)
titanic_training['Cabin2'] = np.where(titanic_training['Cabin2'].isnull(), 0, 1)
titanic_training['Cabin3'] = np.where(titanic_training['Cabin3'].isnull(), 0, 1)
titanic_training['CabinNum'] = titanic_training['Cabin1'] + titanic_training['Cabin2'] + titanic_training['Cabin3']

In [12]:
titanic_training['MultipleCabins']=np.where(titanic_training['CabinNum'] > 1, 1, 0)

In [13]:
titanic_training.drop(columns=['Cabin1', 'Cabin2','Cabin3','CabinNum'], inplace = True)

In [14]:
titanic_training.head()

Unnamed: 0_level_0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,CabinLetter,MultipleCabins
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,0,3,male,22.0,1,0,7.25,S,,0
2,1,1,female,38.0,1,0,71.2833,C,C,0
3,1,3,female,26.0,0,0,7.925,S,,0
4,1,1,female,35.0,1,0,53.1,S,C,0
5,0,3,male,35.0,0,0,8.05,S,,0


In [15]:
nullseries = titanic_training.isnull().sum()
nullseries[nullseries > 0]

Age            177
Embarked         2
CabinLetter    691
dtype: int64

In [16]:
nullseries = titanic_training.isnull().sum()/len(titanic_training)
nullseries[nullseries > 0]

Age            0.198653
Embarked       0.002245
CabinLetter    0.775533
dtype: float64

In [17]:
titanic_training.describe()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,MultipleCabins
count,891.0,891.0,714.0,891.0,891.0,891.0,891.0
mean,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208,0.022447
std,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429,0.148214
min,0.0,1.0,0.42,0.0,0.0,0.0,0.0
25%,0.0,2.0,20.125,0.0,0.0,7.9104,0.0
50%,0.0,3.0,28.0,0.0,0.0,14.4542,0.0
75%,1.0,3.0,38.0,1.0,0.0,31.0,0.0
max,1.0,3.0,80.0,8.0,6.0,512.3292,1.0


In [18]:
for column in ('Sex', 'Embarked','CabinLetter','Pclass'):
    titanic_training[column] = titanic_training[column].astype('category')

In [19]:
titanic_training.dtypes

Survived             int64
Pclass            category
Sex               category
Age                float64
SibSp                int64
Parch                int64
Fare               float64
Embarked          category
CabinLetter       category
MultipleCabins       int64
dtype: object

In [42]:
X = titanic_training.drop('Survived', axis=1)
y = titanic_training['Survived']

In [43]:
train_X, valid_X, train_y, valid_y = train_test_split(X, y, test_size=0.4, random_state = 202)

In [44]:
train_X.head()

Unnamed: 0_level_0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,CabinLetter,MultipleCabins
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
567,3,male,19.0,0,0,7.8958,S,,0
325,3,male,,8,2,69.55,S,,0
757,3,male,28.0,0,0,7.7958,S,,0
193,3,female,19.0,1,0,7.8542,S,,0
222,2,male,27.0,0,0,13.0,S,,0


In [32]:
from sklearn.impute import SimpleImputer

In [33]:
imp = SimpleImputer(missing_values=np.nan, strategy='mean')

In [34]:
train_X_imp = imp.fit_transform(train_X)
valid_X_imp = imp.transform(valid_X)

In [27]:
from sklearn.preprocessing import OrdinalEncoder,OneHotEncoder, FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn import set_config
from sklearn.preprocessing import StandardScaler

In [50]:
train_X.dtypes

Pclass            category
Sex               category
Age                float64
SibSp                int64
Parch                int64
Fare               float64
Embarked          category
CabinLetter       category
MultipleCabins       int64
dtype: object

In [76]:
knn_pipeline = Pipeline(steps =[
    ['preprocessing', 
     ColumnTransformer(remainder='passthrough',
     transformers=[(
             'dummies', OneHotEncoder(), train_X.select_dtypes(include=['category']).columns)
              ])]
    ,('imputer', SimpleImputer(strategy='mean'))
    ,('scaler', StandardScaler())
    ,('model', KNeighborsClassifier())
    ])

In [77]:
knn_pipeline.fit(train_X, train_y)
knn_pipeline.score(valid_X, valid_y)

0.7899159663865546

In [97]:
rf_pipeline = Pipeline(steps =[
    ['preprocessing', 
     ColumnTransformer(remainder='passthrough',
     transformers=[(
             'dummies', OneHotEncoder(), train_X.select_dtypes(include=['category']).columns)
              ])]
    ,('imputer', SimpleImputer(strategy='mean'))
    ,('model', RandomForestClassifier(random_state =123))
    ])

In [79]:
rf_pipeline.fit(train_X, train_y)
rf_pipeline.score(valid_X, valid_y)

0.803921568627451

In [91]:
from sklearn.model_selection import cross_validate
scores = cross_validate(rf_pipeline, X, y, 
                        scoring=('accuracy'), cv=5)

print(scores['test_score'].mean())

0.8047266336074319


In [92]:
from sklearn.model_selection import cross_val_score

In [101]:
from sklearn.model_selection import cross_validate
scores = cross_val_score(rf_pipeline, X, y, 
                        scoring=('accuracy'), cv=5)
scores.mean()

0.8047266336074319

In [54]:
train_X.select_dtypes(include=['category']).columns

Index(['Pclass', 'Sex', 'Embarked', 'CabinLetter'], dtype='object')

In [None]:
['Pclass', 'Sex', 'Embarked', 'CabinLetter']

In [39]:
scaler = preprocessing.StandardScaler()
train_X_norm = scaler.fit_transform(train_X_imp)
valid_X_norm = scaler.transform(valid_X_imp)
knn = KNeighborsClassifier()
knn.fit(train_X_norm, train_y)
knn.score(train_X_norm, train_y)
knn.score(valid_X_norm, valid_y)

0.7899159663865546

In [None]:
pipeline = Pipeline([
    ['preprocessing', ColumnTransformer(remainder='passthrough',
        transformers=[
            ('dummies', OneHotEncoder(drop='first'), ['Category', 'currency', 'endDay']),
        ]
    )],
    ['model', LogisticRegression(penalty="l2", C=1e42, solver='liblinear')]
])

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.pipeline import Pipeline

preprocessing = ColumnTransformer(remainder='passthrough',
    transformers=[
        ('vacation', OrdinalEncoder(categories=[['No', 'Yes'], ['No', 'Yes']]), ['VACATION', 'SW']),
        ('slot', OrdinalEncoder(categories=[['Free', 'Controlled']]), ['SLOT']),
        ('gate', OrdinalEncoder(categories=[['Free', 'Constrained']]), ['GATE']),
    ]
)

model = LinearRegression(normalize=True)

pipeline = Pipeline([
    ['preprocessing', preprocessing],
    ['model', model],
])

In [None]:
scaled_pipeline = Pipeline([
    ['preprocessing', ColumnTransformer(remainder='passthrough', transformers=[
        ('vacation', OrdinalEncoder(categories=[['No', 'Yes'], ['No', 'Yes']]), ['VACATION', 'SW']),
        ('slot', OrdinalEncoder(categories=[['Free', 'Controlled']]), ['SLOT']),
        ('gate', OrdinalEncoder(categories=[['Free', 'Constrained']]), ['GATE']),
    ])],
    ['scaler', StandardScaler()],
    ['model', LinearRegression(normalize=True)],
])

In [None]:
import numpy as np
from sklearn.model_selection import cross_validate

scores = cross_validate(scaled_pipeline, X, y, 
                        scoring=('r2', 'neg_mean_absolute_error'), cv=5)

In [None]:
classifier = Pipeline([
    ['model', VotingClassifier([
        ('logit', LogisticRegression(penalty="l2", C=1e42, solver='liblinear')),
        ('dtree', DecisionTreeClassifier()),
        ('bagging', BaggingClassifier(DecisionTreeClassifier(), max_samples=0.5, max_features=0.5)),
        ('boosted', AdaBoostClassifier(n_estimators=100, base_estimator=DecisionTreeClassifier())),
        ('rf', RandomForestClassifier(max_features=3, min_samples_split=300,
                                      random_state=0, n_estimators=100, criterion='entropy')),
        ('lda', LinearDiscriminantAnalysis()),
        ('nn', Pipeline([
            ('scaler', MinMaxScaler()),
            ('nn', MLPClassifier(hidden_layer_sizes=(10), activation='logistic', solver='lbfgs', 
                                 random_state=12, max_iter=5000)),
        ]))
    ], voting='soft')]
])

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn import set_config
set_config(display='diagram')   

pipeline = Pipeline([
    ['preprocessing', ColumnTransformer(remainder='passthrough',
        transformers=[
            ('dummies', OneHotEncoder(drop='first'), ['column1', 'column2', 'column3']),
        ]
    )],
    ['model', LogisticRegression(penalty="l2", C=1e42, solver='liblinear')]
])

X = df.drop(columns=[outcome])
y = df[outcome]

# partition data
train_X, valid_X, train_y, valid_y = train_test_split(X, y, test_size=0.4, random_state=1)

pipeline.fit(train_X, train_y)

In [None]:
scaled_pipeline = Pipeline([
    ['preprocessing', ColumnTransformer(remainder='passthrough', transformers=[
        ..... 
    ])],
    ['scaler', StandardScaler()],
    ['model', LinearRegression(normalize=True)],
])

In [51]:
xg_model = xgb.XGBClassifier(objective='binary:logistic', 
                             n_estimators=20, 
                             seed=123,
                             use_label_encoder=False,
                             verbosity=0)

In [53]:
xg_model.fit(train_X_imp, train_y)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=20, n_jobs=8, num_parallel_tree=1, random_state=123,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=123,
              subsample=1, tree_method='exact', use_label_encoder=False,
              validate_parameters=1, verbosity=0)

In [56]:
xg_model.score(train_X_imp, train_y)

0.9157303370786517

In [57]:
xg_model.score(valid_X_imp, valid_y)

0.8179271708683473

In [60]:
#accuracy_score(valid_y, xg_model.predict(valid_X))

In [67]:
data_mean, data_std = np.mean(train_X.Age), np.std(train_X.Age)
cut_off = data_std * 3
lower, upper = data_mean - cut_off, data_mean + cut_off

In [72]:
outliers = [x for x in train_X.Age if x < lower or x > upper]
print('Identified outliers: %d' % len(outliers))

Identified outliers: 2


In [73]:
outliers_removed = [x for x in train_X.Age if x >= lower and x <= upper]
print('Non-outlier observations: %d' % len(outliers_removed))

Non-outlier observations: 435


In [38]:
# Create the DMatrix from X and y: churn_dmatrix
churn_dmatrix = xgb.DMatrix(data=train_X, label=train_y)

# Create the parameter dictionary: params
params = {"objective":"reg:logistic", "max_depth":3}

# Perform cross-validation: cv_results
cv_results = xgb.cv(dtrain=churn_dmatrix, params=params, 
                  nfold=3, num_boost_round=5, 
                  metrics="error", as_pandas=True, seed=123)

# Print cv_results
print(cv_results)

# Perform cross_validation: cv_results
cv_results = xgb.cv(dtrain=churn_dmatrix, params=params, 
                  nfold=3, num_boost_round=5, 
                  metrics="auc", as_pandas=True, seed=123)

# Print cv_results
print(cv_results)

   train-error-mean  train-error-std  test-error-mean  test-error-std
0          0.154494         0.003973         0.200374        0.021678
1          0.154494         0.004587         0.202247        0.022935
2          0.147940         0.008683         0.189138        0.011544
3          0.145131         0.010839         0.191011        0.013761
4          0.145131         0.012632         0.191011        0.013761
   train-auc-mean  train-auc-std  test-auc-mean  test-auc-std
0        0.872579       0.024674       0.838823      0.021684
1        0.892451       0.006199       0.848757      0.031334
2        0.898664       0.009607       0.862181      0.040884
3        0.904105       0.011480       0.863543      0.037934
4        0.908408       0.013190       0.867602      0.039452


In [64]:
# Create the parameter grid: gbm_param_grid 
gbm_param_grid = {
    'n_estimators': [25],
    'max_depth': range(1, 40)
}

# Instantiate the regressor: gbm
gbm = xgb.XGBClassifier(n_estimators=10)

# Perform random search: grid_mse
randomized_mse =  RandomizedSearchCV(param_distributions=gbm_param_grid, estimator=gbm,scoring='accuracy', n_iter=5, cv=4, verbose=1)


# Fit randomized_mse to the data
randomized_mse.fit(train_X,train_y)

# Print the best parameters and lowest RMSE
print("Best parameters found: ", randomized_mse.best_params_)
print("Highest accuracy found: ", np.sqrt(np.abs(randomized_mse.best_score_)))

Fitting 4 folds for each of 5 candidates, totalling 20 fits








Best parameters found:  {'n_estimators': 25, 'max_depth': 3}
Highest accuracy found:  0.8983909980439481




In [None]:
#1 Outliers
#2 Impute Values
#3 Dummies
#4 Model

In [26]:
rfModel = RandomForestClassifier(n_estimators=100, random_state=1)
rfModel.fit(train_X, train_y)

ValueError: Input contains NaN, infinity or a value too large for dtype('float32').

In [None]:
categorical = titanic_training.select_dtypes(include=['object']).columns
numeric = titanic_training.select_dtypes(include=['int64', 'float64']).columns

In [None]:
sns.displot(x="CabinNum", kind ='hist', data=titanic_training)

In [None]:
#Impute the Median for missing RetailPrice rows
medianRetailPrice = data_load['RetailPrice'].median()
data_load['RetailPrice'] = data_load['RetailPrice'].fillna(value = medianRetailPrice)

In [58]:
scaler = preprocessing.StandardScaler()
train_X_norm = scaler.fit_transform(train_X_imp)
valid_X_norm = scaler.transform(valid_X_imp)

In [59]:
knn = KNeighborsClassifier()
knn.fit(train_X_norm, train_y)

KNeighborsClassifier()

In [60]:
knn.score(train_X_norm, train_y)

0.8558052434456929

In [63]:
knn.score(valid_X_norm, valid_y)

0.7899159663865546

In [None]:
knn_pred = knn.predict(valid_X_norm)
knn_proba = knn.predict_proba(valid_X_norm)

In [None]:
#titanic_training['CabinNum']=titanic_training['CabinNum'].replace(0, np.nan)

In [None]:
titanic_training['Cabin1'].str.extract(r'(?P<Cabin1_Code>[A-Z]+)').value_counts()

In [None]:
titanic_training['Cabin2'].str.extract(r'(?P<Cabin2_Code>[A-Z]+)').value_counts()

In [None]:
sns.boxplot(x='Age', y ='Sex', data=titanic_training)

In [None]:
titanic_training['Cabin3'].str.extract(r'(?P<Cabin3_Code>[A-Z]+)').value_counts()

In [None]:
titanic_training[titanic_training.Cabin2.notnull()]

In [None]:
titanic_training[titanic_training.Cabin3.notnull()]

In [None]:
#people always book under same letter code

In [None]:
titanic_training.CabinNum.value_counts()

In [None]:
titanic_training.shape

In [None]:
titanic_training.dtypes

In [None]:
titanic_training.info()

In [None]:
titanic_training.describe()

In [None]:
titanic_training.median()

In [None]:
titanic_training.groupby('Sex').mean().round(2)

In [None]:
nullseries = titanic_training.isnull().sum()/len(titanic_training)
nullseries[nullseries > 0]

- Remove Cabin lots of NULLS 
- impute AGE
- check outliers!

In [None]:
nullseries = titanic_training.isnull().sum()
nullseries[nullseries > 0]

In [None]:
titanic_training[categorical].nunique()

In [None]:
titanic_training[numeric].nunique()

In [None]:
#titanic_training['Ticket'].str.split(" ").head(50)
#titanic_training['Ticket'].str.extract('(.*)\s(.*)')

In [None]:
titanic_training[numeric].nunique()

### Data Exploration: Categorical Data

In [None]:
titanic_training[categorical]

In [None]:
sns.catplot(x="Sex", 
            data=titanic_training, 
            kind="count",
            hue="Survived")
plt.show()

In [None]:
sns.catplot(x="Embarked", 
            data=titanic_training, 
            kind="count",
            hue="Survived")
plt.show()

In [None]:
sns.catplot(x="SibSp", 
            data=titanic_training, 
            kind="count",
            hue="Survived")
plt.show()

In [None]:
sns.catplot(
    x="Sex",
    y="Age",
    data=titanic_training,
    kind = "bar")

plt.show()

In [None]:
g = sns.FacetGrid(titanic_training, col="Embarked",  row="Sex")
g.map_dataframe(sns.countplot, x="Survived")
plt.show()

In [None]:
sns.catplot(x="Survived", row="Sex", col="Embarked", kind="count", data = titanic_training)
plt.show()

In [None]:
sns.catplot(x="Survived", hue="Sex", col="Embarked", kind="count", data = titanic_training)
plt.show()

In [None]:
 sns.set_context("paper")
fig, axes = plt.subplots(2, 2, figsize=(12, 5))
fig.tight_layout(pad=2)

fig.suptitle('Categorical Exploration')

sns.countplot(ax=axes[0,0], x="Embarked",hue="Survived", data=titanic_training)
sns.countplot(ax=axes[0,1], x="Sex",hue="Survived", data=titanic_training)
sns.countplot(ax=axes[1,0], x="Parch",hue="Survived", data=titanic_training)
sns.countplot(ax=axes[1,1], x="SibSp",hue="Survived", data=titanic_training)
sns.countplot(ax=axes[2,0], x="Pclass",hue="Survived", data=titanic_training)
sns.countplot(ax=axes[2,2], x="Pclass",hue="Survived", data=titanic_training)

plt.show()

In [None]:
titanic_training[numeric].nunique()

In [None]:
titanic_training[categorical].nunique()

In [None]:
titanic_training['Cabin'][titanic_training['Cabin'].notnull()].head(10)

In [None]:
titanic_training['Cabin'].value_counts()

In [None]:
#Categorical Data Exploration
 
sns.set_context("paper")
fig, axes = plt.subplots(3, 3, figsize=(16, 10))
fig.tight_layout(pad=10)
 
fig.suptitle('Categorical Exploration', fontsize = 15)
 
rows = [0,0,0,1,1,1,2,2,2]
columns = [0,1,2,0,1,2,0,1,2]
label = []
 
for row, col, cat, lab in zip(rows, columns, categorical[1:], label):
    sns.countplot(ax=axes[row,col], x = cat, hue="", data=df)
    axes[row, col].tick_params(axis='x', labelrotation=45)
    axes[row, col].set(xlabel=lab)
 
 
fig, axes = plt.subplots(3, 2, figsize=(16, 10))
fig.tight_layout(pad=10)
 
fig.suptitle('Numeric Exploration', fontsize = 15)
 
rows = [0,0,1,1,2,2]
columns = [0,1,0,1,0,1]
label = ['Name', 'Sex', 'Ticket', 'Cabin']
 
for row, col, cat, lab in zip(rows, columns, df[[]], label):
    sns.histplot(ax=axes[row,col], x = cat, hue="", data=df)
    axes[row, col].tick_params(axis='x', labelrotation=45)
    axes[row, col].set(xlabel=lab)


In [None]:
titanic_training[numeric].nunique()

In [None]:
sns.displot(x="Age",hue="Survived", kind ='hist', data=titanic_training)

### Data Exploration: Numeric Data

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

fig.suptitle('Numeric Exploration')

sns.histplot(ax=axes[0], x="Age",hue="Survived", data=titanic_training)
sns.histplot(ax=axes[1], x="Fare",hue="Survived", data=titanic_training)

plt.show()

In [None]:
titanic_training[numeric].nunique()

### Additional Feature Engineering

### Model Creation

### Model Validation

### Generate Predictions (Inference)

### End of Workbook

In [None]:
%matplotlib inline
fig, axes = plt.subplots(3, 3, figsize=(18, 10))
  
fig.suptitle('FARE Scatterplot Exploration')
  
sns.scatterplot(ax=axes[0, 0], x= 'PAX', y= 'FARE', data=air_df)
sns.scatterplot(ax=axes[0, 1], x= 'HI', y= 'FARE', data=air_df)
sns.scatterplot(ax=axes[0, 2], x= 'NEW', y= 'FARE', data=air_df)

sns.scatterplot(ax=axes[1, 0], x= 'COUPON', y= 'FARE', data=air_df)
sns.scatterplot(ax=axes[1, 1], x= 'S_INCOME', y= 'FARE', data=air_df)
sns.scatterplot(ax=axes[1, 2], x= 'S_POP', y= 'FARE', data=air_df)

sns.scatterplot(ax=axes[2, 0], x= 'DISTANCE', y= 'FARE', data=air_df)
sns.scatterplot(ax=axes[2, 1], x= 'E_INCOME', y= 'FARE', data=air_df)
sns.scatterplot(ax=axes[2, 2], x= 'E_POP', y= 'FARE', data=air_df)

In [None]:
trainData.pivot_table(values = 'Online', index=['PersonalLoan'], columns = 'CreditCard', aggfunc='count')
trainData.pivot_table(index=['CreditCard', 'PersonalLoan'], columns = 'Online', aggfunc=len)

In [None]:
#QA Answers
pd.set_option('precision', 3)
print(trainData['PersonalLoan'].value_counts()/len(trainData))
predictors = ['Online', 'CreditCard']

for predictor in predictors:
    df = trainData[['PersonalLoan', predictor]]
    freqTable = df.pivot_table(index = ['PersonalLoan'], columns = predictor, aggfunc = len)
    propTable = freqTable.apply(lambda x: x / sum(x), axis = 1)
    print(propTable)
    
pd.reset_option('precision')

In [None]:
titanic_training[['Survived','Ticket']]\
.head(10)

In [None]:
##Example Submission
#example_submission = pd.read_csv('/Users/jpmcelyea/GIT_Code/Data-Science/Kaggle-Titanic/titanic/gender_submission.csv')
#example_submission.head()