In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import * 
from sklearn.pipeline import Pipeline
from sklearn.impute import KNNImputer
from sklearn.decomposition import *
from sklearn.linear_model import *

In [None]:
def _split_cabin(cabin):
    """Split one cabin string into ``(deck, row, side)``.

    The deck is the first character, the side is the last character and the
    row number is everything between the two single-character separators
    (presumably a layout such as ``"B/0/P"`` -- confirm against the data).

    Non-string cabins (missing values) yield three NaNs. A row number that
    cannot be parsed as a float yields NaN for the row only, instead of
    aborting the whole load with a ValueError.
    """
    if not isinstance(cabin, str):
        return np.nan, np.nan, np.nan
    try:
        row = float(cabin[2:-2])
    except ValueError:
        row = np.nan
    return cabin[0:1], row, cabin[-1:]


def clean_data(path, id_col = 0, is_test = False):
    """Load a passenger CSV and return an all-numeric feature frame.

    Parameters
    ----------
    path : str or path-like
        CSV file to read.
    id_col : int or str, default 0
        Column used as the frame index (forwarded to ``pd.read_csv``).
    is_test : bool, default False
        When False the ``Transported`` label is encoded as 0/1 and kept as
        the last column; when True no label column is expected or returned.

    Returns
    -------
    pandas.DataFrame
        Columns, in order: HomePlanet, CryoSleep, Destination, Age, VIP,
        RoomService, FoodCourt, ShoppingMall, Spa, VRDeck, Name, Decks, Rows,
        Sides (+ Transported when ``is_test`` is False). ``Cabin`` is replaced
        by its three parts (Decks, Rows, Sides).

    Notes
    -----
    * Categorical columns are integer-encoded with fixed lookup tables.
      Missing values and categories absent from a table become NaN so a
      downstream imputer can handle them.
    * ``Name`` is reduced to the surname and encoded as the position of that
      surname's last occurrence in this file. Missing names stay NaN.
      The codes are therefore specific to one file and are not comparable
      between two different CSVs.
    """
    df = pd.read_csv(path, index_col=id_col)

    deck_codes = {'A': 0, 'B': 1, 'C': 2, 'D': 3, 'E': 4, 'F': 5, 'G': 6, 'T': 7}
    side_codes = {'P': 0, 'S': 1}
    planet_codes = {"Earth": 0, "Mars": 1, "Europa": 2}
    bool_codes = {True: 1, False: 0}
    destination_codes = {'TRAPPIST-1e': 0, 'PSO J318.5-22': 1, '55 Cancri e': 2}

    # Cabin -> deck / row number / side.
    cabin_parts = pd.DataFrame(
        [_split_cabin(cabin) for cabin in df["Cabin"]],
        columns=["Decks", "Rows", "Sides"],
    )
    # .to_numpy() assigns positionally, so no index alignment is involved.
    df["Decks"] = cabin_parts["Decks"].map(deck_codes).to_numpy()
    df["Rows"] = cabin_parts["Rows"].astype(float).to_numpy()
    df["Sides"] = cabin_parts["Sides"].map(side_codes).to_numpy()

    # Series.map sends unknown / missing categories to NaN and returns a
    # numeric column, unlike dict-based replace which leaves unknown strings.
    df["HomePlanet"] = df["HomePlanet"].map(planet_codes)
    df["Destination"] = df["Destination"].map(destination_codes)
    for col in ["CryoSleep", "VIP"]:
        df[col] = df[col].map(bool_codes)
    if not is_test:
        df["Transported"] = df["Transported"].map(bool_codes)

    # Surname = last whitespace-separated token of the name.
    surnames = [
        name.split(" ")[-1].strip() if isinstance(name, str) else np.nan
        for name in df["Name"]
    ]
    # Later occurrences overwrite earlier ones, so each surname maps to the
    # position of its last occurrence. Missing names are skipped on purpose.
    surname_codes = {
        surname: position
        for position, surname in enumerate(surnames)
        if isinstance(surname, str)
    }
    df["Name"] = [surname_codes.get(surname, np.nan) for surname in surnames]

    feature_cols = ['HomePlanet', 'CryoSleep', 'Destination', 'Age',
                    'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck',
                    'Name', 'Decks', 'Rows', 'Sides']
    if not is_test:
        feature_cols.append('Transported')

    return df[feature_cols]


In [None]:
# Build the cleaned training frame (label column included) and plot the
# pairwise correlations between its columns.
train_df = clean_data(path='data/train.csv', is_test=False)
train_df  # not displayed: only a cell's last expression is rendered
train_corr = train_df.corr()
sns.heatmap(train_corr)

In [None]:
# Cleaned test features; is_test=True means no Transported column is returned.
test_df = clean_data(path='data/test.csv', is_test=True)
test_df

In [None]:
# Split the training array into features (all but the last column) and the
# label vector (the last column, which clean_data fills with Transported).
train = train_df.values
X_train, y_train = train[:, :-1], train[:, -1]

In [None]:
# Imputation: fill NaNs in the training features from the 4 nearest neighbours.
knn_imp = KNNImputer(n_neighbors=4, missing_values=np.nan)
X_train_imp = knn_imp.fit_transform(X=X_train)
X_train_imp

In [None]:
# Standardise the imputed training features; the fitted scaler is kept in
# `scaler` for later cells.
scaler = StandardScaler().fit(X_train_imp)
X_train_scaled = scaler.transform(X_train_imp)
X_train_scaled


In [None]:
# Fit a PCA (default settings) on the scaled training features.
pca = PCA().fit(X_train_scaled)
pca

In [None]:
# Pull the fitted PCA attributes into notebook-level names used further down.
explained, explained_ratio = pca.explained_variance_, pca.explained_variance_ratio_
components = pca.components_
# NOTE(review): this picks the LAST row of components_, although the name
# suggests the first principal component -- confirm which one is intended.
pr0 = components[len(components) - 1]
vals = X_train_scaled[1]
pr0

In [None]:
# plotting princomps: per-component (left) and cumulative (right) share of
# explained variance, both in percent.
dict1 = {f"PC{i}": [explained_ratio[i]] for i in range(len(explained_ratio))}
df = pd.DataFrame(dict1)

n_components = len(explained_ratio)
x = range(1, n_components + 1)
# Named var_ratio rather than `vars` so the builtin vars() is not shadowed
# for the rest of the notebook session.
var_ratio = explained_ratio
cum_vars = explained_ratio.cumsum()

fig, (ax1, ax2) = plt.subplots(ncols=2, sharex=True)
sns.lineplot(x = x, y = var_ratio*100, marker="o", markerfacecolor="red",  ax=ax1)
sns.lineplot(x = x, y = cum_vars*100, marker="o", markerfacecolor="red", ax=ax2)

ax1.set_title('Component Variance')
ax2.set_title('Cumulative Variance')

# Tick every 5 components, derived from the actual component count instead
# of a hard-coded 14.
xtick_pos = range(0, n_components, 5)
y1tick_range = range(0, 36, 5)
y2tick_range = range(30, 101, 10)
add_pct = lambda lst: [f"{s}%" for s in lst]

# sharex=True: setting the ticks on ax1 applies to ax2 as well.
ax1.set_xticks(xtick_pos)

ax1.set_yticks(y1tick_range)
ax2.set_yticks(y2tick_range)

ax1.set_yticklabels(add_pct(y1tick_range))
ax2.set_yticklabels(add_pct(y2tick_range))

ax1.set_xlabel("PC Number")
ax2.set_xlabel("PC Number")

# NOTE(review): only the minor grid is requested; if minor ticks are not
# enabled this may draw nothing -- confirm whether a major grid was intended.
ax1.grid(visible=True, which="minor")
ax2.grid(visible=True, which="minor")

plt.show()

In [None]:
# Class balance of the training label.
train_df["Transported"].value_counts()

In [None]:
# Count each class in the last column of train_df (the encoded label).
labels = train_df.iloc[:, -1]
false = labels[labels == 0].count()
true = labels[labels == 1].count()

for caption, total in (("Transported", true), ("Not Transported", false)):
    print(f"{caption}: {total}")

In [None]:
# Random baseline: one seeded True/False guess per test passenger.
# The sample size follows the test frame instead of a hard-coded 4277, so the
# guesses always line up with test_df.
# NOTE(review): the probabilities are presumably the training class
# proportions -- confirm against the value counts.
rng = np.random.default_rng(seed=111)
random_guesses = rng.choice([True, False], size = (len(test_df),), p = [0.50362361, 0.49637639])
random_guesses

In [None]:
# Submission 1: the random guesses as a single Transported column, indexed
# like test_df, written to CSV.
submission1_df = pd.DataFrame({"Transported": random_guesses}, index=test_df.index)
submission1_df.to_csv("submissions/s1.csv")

In [None]:
# Preprocess the test features with the transformers that were fitted on the
# TRAINING data (knn_imp, scaler). Re-fitting on the test set would impute and
# standardise it with different statistics than the ones the model is trained
# on, so train and test features would not be on the same scale.
X_test = test_df.values
X_test_imp = knn_imp.transform(X_test)
X_test_scaled = scaler.transform(X_test_imp)

In [None]:
# Logistic-regression baseline: fit on the scaled training data, predict the
# scaled test data, and cast the predicted labels to booleans.
logistic = LogisticRegression().fit(X_train_scaled, y_train)
logistic_preds = logistic.predict(X_test_scaled).astype(bool)


In [None]:
# Submission 2: the logistic-regression predictions as a single Transported
# column, indexed like test_df, written to CSV.
submission2_df = pd.DataFrame({"Transported": logistic_preds}, index=test_df.index)
submission2_df.to_csv("submissions/s2.csv")

In [None]:
def _split_cabin(cabin):
    """Split one cabin string into ``(deck, row, side)``.

    The deck is the first character, the side is the last character and the
    row number is everything between the two single-character separators
    (presumably a layout such as ``"B/0/P"`` -- confirm against the data).

    Non-string cabins (missing values) yield three NaNs. A row number that
    cannot be parsed as a float yields NaN for the row only, instead of
    aborting the whole load with a ValueError.
    """
    if not isinstance(cabin, str):
        return np.nan, np.nan, np.nan
    try:
        row = float(cabin[2:-2])
    except ValueError:
        row = np.nan
    return cabin[0:1], row, cabin[-1:]


def clean_data(path, id_col = 0, is_test = False):
    """Load a passenger CSV and return an all-numeric feature frame.

    Parameters
    ----------
    path : str or path-like
        CSV file to read.
    id_col : int or str, default 0
        Column used as the frame index (forwarded to ``pd.read_csv``).
    is_test : bool, default False
        When False the ``Transported`` label is encoded as 0/1 and kept as
        the last column; when True no label column is expected or returned.

    Returns
    -------
    pandas.DataFrame
        Columns, in order: HomePlanet, CryoSleep, Destination, Age, VIP,
        RoomService, FoodCourt, ShoppingMall, Spa, VRDeck, Name, Decks, Rows,
        Sides (+ Transported when ``is_test`` is False). ``Cabin`` is replaced
        by its three parts (Decks, Rows, Sides).

    Notes
    -----
    * Categorical columns are integer-encoded with fixed lookup tables.
      Missing values and categories absent from a table become NaN so a
      downstream imputer can handle them.
    * ``Name`` is reduced to the surname and encoded as the position of that
      surname's last occurrence in this file. Missing names stay NaN.
      The codes are therefore specific to one file and are not comparable
      between two different CSVs.
    """
    df = pd.read_csv(path, index_col=id_col)

    deck_codes = {'A': 0, 'B': 1, 'C': 2, 'D': 3, 'E': 4, 'F': 5, 'G': 6, 'T': 7}
    side_codes = {'P': 0, 'S': 1}
    planet_codes = {"Earth": 0, "Mars": 1, "Europa": 2}
    bool_codes = {True: 1, False: 0}
    destination_codes = {'TRAPPIST-1e': 0, 'PSO J318.5-22': 1, '55 Cancri e': 2}

    # Cabin -> deck / row number / side.
    cabin_parts = pd.DataFrame(
        [_split_cabin(cabin) for cabin in df["Cabin"]],
        columns=["Decks", "Rows", "Sides"],
    )
    # .to_numpy() assigns positionally, so no index alignment is involved.
    df["Decks"] = cabin_parts["Decks"].map(deck_codes).to_numpy()
    df["Rows"] = cabin_parts["Rows"].astype(float).to_numpy()
    df["Sides"] = cabin_parts["Sides"].map(side_codes).to_numpy()

    # Series.map sends unknown / missing categories to NaN and returns a
    # numeric column, unlike dict-based replace which leaves unknown strings.
    df["HomePlanet"] = df["HomePlanet"].map(planet_codes)
    df["Destination"] = df["Destination"].map(destination_codes)
    for col in ["CryoSleep", "VIP"]:
        df[col] = df[col].map(bool_codes)
    if not is_test:
        df["Transported"] = df["Transported"].map(bool_codes)

    # Surname = last whitespace-separated token of the name.
    surnames = [
        name.split(" ")[-1].strip() if isinstance(name, str) else np.nan
        for name in df["Name"]
    ]
    # Later occurrences overwrite earlier ones, so each surname maps to the
    # position of its last occurrence. Missing names are skipped on purpose.
    surname_codes = {
        surname: position
        for position, surname in enumerate(surnames)
        if isinstance(surname, str)
    }
    df["Name"] = [surname_codes.get(surname, np.nan) for surname in surnames]

    feature_cols = ['HomePlanet', 'CryoSleep', 'Destination', 'Age',
                    'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck',
                    'Name', 'Decks', 'Rows', 'Sides']
    if not is_test:
        feature_cols.append('Transported')

    return df[feature_cols]


In [None]:
# Cleaned test features; is_test=True means no Transported column is returned.
test_df = clean_data(path='data/test.csv', is_test=True)
test_df

In [None]:
# Imputation: fill NaNs in the training features from the 4 nearest neighbours.
knn_imp = KNNImputer(n_neighbors=4, missing_values=np.nan)
X_train_imp = knn_imp.fit_transform(X=X_train)
X_train_imp

In [None]:
# Fit a PCA (default settings) on the scaled training features.
pca = PCA().fit(X_train_scaled)
pca

In [None]:
# plotting princomps: per-component (left) and cumulative (right) share of
# explained variance, both in percent.
dict1 = {f"PC{i}": [explained_ratio[i]] for i in range(len(explained_ratio))}
df = pd.DataFrame(dict1)

n_components = len(explained_ratio)
x = range(1, n_components + 1)
# Named var_ratio rather than `vars` so the builtin vars() is not shadowed
# for the rest of the notebook session.
var_ratio = explained_ratio
cum_vars = explained_ratio.cumsum()

fig, (ax1, ax2) = plt.subplots(ncols=2, sharex=True)
sns.lineplot(x = x, y = var_ratio*100, marker="o", markerfacecolor="red",  ax=ax1)
sns.lineplot(x = x, y = cum_vars*100, marker="o", markerfacecolor="red", ax=ax2)

ax1.set_title('Component Variance')
ax2.set_title('Cumulative Variance')

# Tick every 5 components, derived from the actual component count instead
# of a hard-coded 14.
xtick_pos = range(0, n_components, 5)
y1tick_range = range(0, 36, 5)
y2tick_range = range(30, 101, 10)
add_pct = lambda lst: [f"{s}%" for s in lst]

# sharex=True: setting the ticks on ax1 applies to ax2 as well.
ax1.set_xticks(xtick_pos)

ax1.set_yticks(y1tick_range)
ax2.set_yticks(y2tick_range)

ax1.set_yticklabels(add_pct(y1tick_range))
ax2.set_yticklabels(add_pct(y2tick_range))

ax1.set_xlabel("PC Number")
ax2.set_xlabel("PC Number")

# NOTE(review): only the minor grid is requested; if minor ticks are not
# enabled this may draw nothing -- confirm whether a major grid was intended.
ax1.grid(visible=True, which="minor")
ax2.grid(visible=True, which="minor")

plt.show()

In [None]:
# Count each class in the last column of train_df (the encoded label).
labels = train_df.iloc[:, -1]
false = labels[labels == 0].count()
true = labels[labels == 1].count()

for caption, total in (("Transported", true), ("Not Transported", false)):
    print(f"{caption}: {total}")

In [None]:
# Submission 1: the random guesses as a single Transported column, indexed
# like test_df, written to CSV.
submission1_df = pd.DataFrame({"Transported": random_guesses}, index=test_df.index)
submission1_df.to_csv("submissions/s1.csv")

In [None]:
# Logistic-regression baseline: fit on the scaled training data, predict the
# scaled test data, and cast the predicted labels to booleans.
logistic = LogisticRegression().fit(X_train_scaled, y_train)
logistic_preds = logistic.predict(X_test_scaled).astype(bool)
