In [None]:
# Update sklearn to prevent version mismatches
!pip install sklearn --upgrade

In [None]:
# install joblib. This will be used to save your model. 
# Restart your kernel after installing 
!pip install joblib

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer

# Read the CSV and Perform Basic Data Cleaning

In [None]:
# Load the raw table, discard columns that are entirely null,
# then discard every row that still contains a null.
df = (
    pd.read_csv("exoplanet_data.csv")
    .dropna(axis='columns', how='all')
    .dropna()
)
df.head()

# Select your features (columns)

In [None]:
# Features: every column of df except the target. X is a second name for the same frame.
selected_features = df.drop(columns='koi_disposition')
X = selected_features
# Target: the koi_disposition column converted with to_numpy().
y = df['koi_disposition'].to_numpy()
y


In [None]:
# Display the feature frame (all columns of df except koi_disposition).
selected_features

In [None]:
# Feature names as a plain list, in the same order as the columns of selected_features.
columns = list(selected_features.columns)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Fit the encoder on y and encode y in one step.
label_encoder = LabelEncoder()
encoded_y = label_encoder.fit_transform(y)

In [None]:
# Print the class -> integer mapping once per distinct class instead of once
# per data row: LabelEncoder encodes each class as its index in `classes_`.
for label, original_class in enumerate(label_encoder.classes_):
    print('Original Class: ' + str(original_class))
    print('Encoded Label: ' + str(label))
    print('-' * 12)

In [None]:
# Inspect the integer-encoded labels produced by label_encoder.
encoded_y

In [None]:
# Turn the encoded labels into a single-column 2-D array.
encoded_y = np.reshape(encoded_y, (-1, 1))

In [None]:
# Sanity check: shapes of the feature frame and of the un-encoded label array.
print(X.shape, y.shape)

In [None]:
# Confirm the shape of the encoded labels after the reshape to one column.
encoded_y.shape

# Create a Train Test Split

Use `koi_disposition` for the y values

In [None]:
# Target: the koi_disposition column. Features: every other column of df.
y = df["koi_disposition"]
X = df.drop(columns="koi_disposition")

In [None]:
# Split the features and the encoded labels into training and testing parts.
# random_state is fixed so the split is the same on every run.
from sklearn.model_selection import train_test_split

(X_train, X_test,
 encoded_y_train, encoded_y_test) = train_test_split(X, encoded_y, random_state=1)

In [None]:
# Peek at the first rows of the training features.
X_train.head()

In [None]:
# Sanity check: training features and training labels after the split.
print(X_train.shape, encoded_y_train.shape)

# Pre-processing

Scale the data using the MinMaxScaler and perform some feature selection

In [None]:
# Fit the min-max scaler on the training features only.
from sklearn.preprocessing import MinMaxScaler

X_scaler = MinMaxScaler()
X_scaler = X_scaler.fit(X_train)

In [None]:
# Apply the training-fitted scaler to both splits.
X_scaled_train, X_scaled_test = (
    X_scaler.transform(features) for features in (X_train, X_test)
)

In [None]:
# Inspect the scaled training features.
X_scaled_train

In [None]:
# Preview the flattened training labels; the result is displayed only, not stored.
encoded_y_train.ravel()

In [None]:
# Sanity check: scaled training features against the training labels.
print(X_scaled_train.shape, encoded_y_train.shape)

# Train the Model



In [None]:
# Flatten the training labels to 1-D before fitting.
encoded_y_train = np.ravel(encoded_y_train)

In [None]:
# Support vector machine classifier with a linear kernel.
from sklearn.svm import SVC

model = SVC(kernel='linear')
# Train once. fit() returns the estimator itself, so `lsvc` (used by a later
# cell) is a second name for the same fitted object as `model`.
lsvc = model.fit(X_scaled_train, encoded_y_train.ravel())

In [None]:
# Report the model's score on the training split, then on the testing split.
training_score = model.score(X_scaled_train, encoded_y_train)
print(f"Training Data Score: {training_score}")
testing_score = model.score(X_scaled_test, encoded_y_test)
print(f"Testing Data Score: {testing_score}")

In [None]:
# First row of the fitted model's coefficient matrix.
model.coef_[0]

In [None]:
# Full coefficient matrix followed by the list of feature names.
print(model.coef_, columns)

In [None]:
def plot_coefficients(classifier, feature_names, top_features=40, class_num=1):
    """Bar-plot the most negative and most positive coefficients of one ``coef_`` row.

    Negative coefficients are drawn in red, non-negative ones in blue, ordered
    from the most negative on the left to the most positive on the right.

    Parameters
    ----------
    classifier : object
        Fitted estimator exposing a 2-D ``coef_`` attribute.
    feature_names : sequence of str
        One name per coefficient, in the same order as the coefficients.
    top_features : int, default 40
        How many coefficients to take from each end of the sorted row. When
        the row is too short to supply ``2 * top_features`` distinct entries,
        every coefficient is shown exactly once instead.
    class_num : int, default 1
        Index of the row of ``classifier.coef_`` to plot.

    Raises
    ------
    ValueError
        If ``feature_names`` does not have one entry per coefficient.
    """
    coef = np.asarray(classifier.coef_[class_num]).ravel()
    feature_names = np.asarray(feature_names)
    if feature_names.size != coef.size:
        raise ValueError(
            f"feature_names has {feature_names.size} entries but the "
            f"coefficient row has {coef.size}"
        )

    # Clamp the request to what the row can supply, and start the positive
    # slice after the negative one so no coefficient is drawn twice.
    n_each = max(0, min(int(top_features), coef.size))
    order = np.argsort(coef)
    top_negative_coefficients = order[:n_each]
    top_positive_coefficients = order[max(n_each, coef.size - n_each):]
    top_coefficients = np.hstack([top_negative_coefficients, top_positive_coefficients])
    selected = coef[top_coefficients]

    # create plot
    plt.figure(figsize=(15, 5))
    colors = ['red' if c < 0 else 'blue' for c in selected]
    # Bars and tick labels share the same x positions so each label sits
    # under the bar it describes.
    positions = np.arange(top_coefficients.size)
    plt.bar(positions, selected, color=colors)
    plt.xticks(positions, feature_names[top_coefficients], rotation=60, ha='right')
    plt.show()

In [None]:
# NOTE(review): this stores row 0 of coef_ under the name df_coef_1; a later
# cell rebinds df_coef_1 to a DataFrame built from coef_[1].
df_coef_1 = model.coef_[0]

In [None]:
# NOTE(review): CountVectorizer is a text tokenizer. Fitted on a DataFrame it
# sees the column labels as documents, and its vocabulary is ordered
# alphabetically -- presumably not the column order the model was trained on,
# so confirm before using it to label coefficients.
cv = CountVectorizer()
cv.fit(selected_features)
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use whichever the installed version provides.
if hasattr(cv, "get_feature_names_out"):
    cv_feature_names = list(cv.get_feature_names_out())
else:
    cv_feature_names = cv.get_feature_names()
print (len(cv.vocabulary_))
print(len(cv_feature_names))
print (cv_feature_names)

In [None]:
# Label the bars with `columns`, which lists the features in the order of
# model.coef_; the vectorizer vocabulary is sorted alphabetically and would
# attach names to the wrong coefficients.
plot_coefficients(model, columns, 20, 0)

In [None]:
# Label the bars with `columns`, which lists the features in the order of
# model.coef_; the vectorizer vocabulary is sorted alphabetically and would
# attach names to the wrong coefficients.
plot_coefficients(model, columns, 20, 1)

In [None]:
# Label the bars with `columns`, which lists the features in the order of
# model.coef_; the vectorizer vocabulary is sorted alphabetically and would
# attach names to the wrong coefficients.
plot_coefficients(model, columns, 20, 2)

In [None]:
def f_importances(coef, names):
    """Draw a horizontal bar chart of coefficients in ascending order.

    ``coef`` and ``names`` are echoed to stdout as received, paired
    element-wise, and sorted by coefficient value (ties fall back to
    comparing the names) before being plotted.
    """
    print(coef)
    print(names)
    ranked_pairs = sorted(zip(coef, names))
    ranked_coefs, ranked_names = zip(*ranked_pairs)
    row_positions = range(len(ranked_names))
    plt.barh(row_positions, ranked_coefs, align='center')
    plt.yticks(row_positions, ranked_names)
    plt.show()

In [None]:
# Row 0 of coef_: sort by magnitude (largest first) and keep the last 20 rows,
# i.e. the 20 smallest magnitudes. reset_index() keeps the old position as a column.
df_coef_0 = (
    pd.DataFrame({
        'coef': model.coef_[0],
        'abs(coef_)': np.abs(model.coef_[0]),
        'name': columns,
    })
    .sort_values('abs(coef_)', ascending=False)
    .tail(20)
    .reset_index()
)
df_coef_0

In [None]:
# Row 1 of coef_: sort by magnitude (largest first) and keep the last 20 rows,
# i.e. the 20 smallest magnitudes. reset_index() keeps the old position as a column.
df_coef_1 = (
    pd.DataFrame({
        'coef': model.coef_[1],
        'abs(coef_)': np.abs(model.coef_[1]),
        'name': columns,
    })
    .sort_values('abs(coef_)', ascending=False)
    .tail(20)
    .reset_index()
)
df_coef_1

In [None]:
# Row 2 of coef_: sort by magnitude (largest first) and keep the last 20 rows,
# i.e. the 20 smallest magnitudes. reset_index() keeps the old position as a column.
df_coef_2 = (
    pd.DataFrame({
        'coef': model.coef_[2],
        'abs(coef_)': np.abs(model.coef_[2]),
        'name': columns,
    })
    .sort_values('abs(coef_)', ascending=False)
    .tail(20)
    .reset_index()
)
df_coef_2

In [None]:
# Keep only the feature names present in all three bottom-20 tables
# (one table per row of model.coef_), via two inner joins on 'name'.
df_least_important = (
    df_coef_0
    .merge(df_coef_1, on='name', how='inner')
    .merge(df_coef_2, on='name', how='inner')
)

In [None]:
# Names shared by all three bottom-20 tables, as a plain list.
least_important_list = list(df_least_important['name'])
least_important_list

In [None]:
# Horizontal bar chart of row 0 of coef_, labelled with the feature names in `columns`.
f_importances(model.coef_[0], columns)

In [None]:
# Number of feature names.
len(columns)

In [None]:
# Horizontal bar chart of the 30 largest coefficient magnitudes in row 0 of coef_.
(
    pd.Series(np.abs(model.coef_[0]), index=columns)
    .nlargest(30)
    .plot.barh()
)

In [None]:
from sklearn.feature_selection import SelectFromModel

# Build a selector around the already-fitted `lsvc` and apply it to X.
# NOTE(review): X is the unscaled feature frame, whereas lsvc was fitted on
# X_scaled_train -- confirm that selecting from the unscaled data is intended.
model1 = SelectFromModel(estimator=lsvc, prefit=True)
X_new = model1.transform(X)
X_new

# Hyperparameter Tuning

Use `GridSearchCV` to tune the model's parameters

In [None]:
# Create the GridSearchCV model
from sklearn.model_selection import GridSearchCV

# `model` is an SVC with kernel='linear'. gamma only parameterises the
# rbf/poly/sigmoid kernels, so searching over it would refit identical models
# for every gamma value; only C is tuned here.
param_grid = {'C': [500, 600, 700, 800, 1000, 1100, 1200, 1300]}
grid = GridSearchCV(model, param_grid, n_jobs = 10, verbose=3)

In [None]:
# Run the grid search on the scaled training features and the flattened training labels.
grid.fit(X_scaled_train, encoded_y_train)

In [None]:
# Display the grid search object.
grid

In [None]:
# List the result fields recorded by the grid search, in alphabetical order.
sorted(grid.cv_results_.keys())

In [None]:
# Best parameter combination, its score, and its index reported by the grid search.
print(grid.best_params_)
print(grid.best_score_)
print(grid.best_index_)

In [None]:
# Side-by-side table: col_1 holds the predictions of `model` on the scaled test
# features, col_2 holds the flattened encoded test labels.
# NOTE(review): predictions come from `model`, not from the fitted `grid` --
# confirm which one is meant to be reported here.
df_results = pd.DataFrame({
    'col_1': model.predict(X_scaled_test),
    'col_2': encoded_y_test.ravel(),
})
df_results

# Save the Model

In [None]:
# Save the model: the file name carries your name and the object dumped is
# your model variable. Be sure to turn this in to BCS.
# NOTE(review): the object written is `model`, not the fitted `grid` --
# confirm which estimator is meant to be saved.
# If joblib fails to import, try running the install command in terminal/git-bash.
import joblib
filename = 'rubal.sav'
joblib.dump(model, filename)