# Detecting ExoPlanets

## First Task
- Exploring data set
- Finding missing values
- Finding outliers and determining what to do with them

In [None]:
# Load the exoplanet dataset into a DataFrame.
# (Adjust the variable name or file path here if you prefer different ones.)
import pandas as pd

# Show every column when a frame is rendered instead of eliding the middle ones.
pd.options.display.max_columns = None

exoplanet_df = pd.read_csv('exoplanet_dataset.csv')

# Actual shape, printed next to the size this dataset is expected to have.
print(exoplanet_df.shape, "- 9564 rows with 49 features")

# Preview the first rows (rendered as the cell output).
exoplanet_df.head(n=5)

In [None]:
# Replace the terse archive column codes with descriptive names (unit after the
# comma where one applies) so the rest of the notebook is easier to read.
#
# Two entries were corrected:
#   - 'koi_fpflag_ss' used to be mapped to itself and therefore was never renamed;
#     it now follows the naming of the other three false-positive flags.
#   - 'koi_impact' was renamed to the truncated 'ImpactParamete'; it now matches
#     'ImpactParameterUpperUnc' / 'ImpactParameterLowerUnc'.
exoplanet_df = exoplanet_df.rename(columns={
    'kepid': 'KepID',
    'kepoi_name': 'KOIName',
    'kepler_name': 'KeplerName',
    'koi_disposition': 'ExoplanetArchiveDisposition',
    'koi_pdisposition': 'DispositionUsingKeplerData',
    'koi_score': 'DispositionScore',
    'koi_fpflag_nt': 'NotTransit-LikeFalsePositiveFlag',
    'koi_fpflag_ss': 'StellarEclipseFalsePositiveFlag',
    'koi_fpflag_co': 'CentroidOffsetFalsePositiveFlag',
    'koi_fpflag_ec': 'EphemerisMatchIndicatesContaminationFalsePositiveFlag',
    'koi_period': 'OrbitalPeriod, days',
    'koi_period_err1': 'OrbitalPeriodUpperUnc, days',
    'koi_period_err2': 'OrbitalPeriodLowerUnc, days',
    'koi_time0bk': 'TransitEpoch, BKJD',
    'koi_time0bk_err1': 'TransitEpochUpperUnc, BKJD',
    'koi_time0bk_err2': 'TransitEpochLowerUnc, BKJD',
    'koi_impact': 'ImpactParameter',
    'koi_impact_err1': 'ImpactParameterUpperUnc',
    'koi_impact_err2': 'ImpactParameterLowerUnc',
    'koi_duration': 'TransitDuration, hrs',
    'koi_duration_err1': 'TransitDurationUpperUnc, hrs',
    'koi_duration_err2': 'TransitDurationLowerUnc, hrs',
    'koi_depth': 'TransitDepth, ppm',
    'koi_insol': 'InsolationFlux, Earthflux',
    'koi_insol_err1': 'InsolationFluxUpperUnc, Earthflux',
    'koi_insol_err2': 'InsolationFluxLowerUnc, Earthflux',
    'koi_model_snr': 'TransitSignal-to-Noise',
    'koi_tce_plnt_num': 'TCEPlanetNumber',
    'koi_tce_delivname': 'TCEDeliver',
    'koi_steff': 'StellarEffectiveTemperature, K',
    'koi_steff_err1': 'StellarEffectiveTemperatureUpperUnc, K',
    'koi_steff_err2': 'StellarEffectiveTemperatureLowerUnc, K',
    'koi_depth_err1': 'TransitDepthUpperUnc, ppm',
    'koi_depth_err2': 'TransitDepthLowerUnc, ppm',
    'koi_prad': 'PlanetaryRadius, Earthradii',
    'koi_prad_err1': 'PlanetaryRadiusUpperUnc, Earthradii',
    'koi_prad_err2': 'PlanetaryRadiusLowerUnc, Earthradii',
    'koi_teq': 'EquilibriumTemperature, K',
    'koi_teq_err1': 'EquilibriumTemperatureUpperUnc, K',
    'koi_teq_err2': 'EquilibriumTemperatureLowerUnc, K',
    'koi_slogg': 'StellarSurfaceGravity, log10(cm/s^2)',
    'koi_slogg_err1': 'StellarSurfaceGravityUpperUnc, log10(cm/s^2)',
    'koi_slogg_err2': 'StellarSurfaceGravityLowerUnc, log10(cm/s^2)',
    'koi_srad': 'StellarRadius, Solarradii',
    'koi_srad_err1': 'StellarRadiusUpperUnc, Solarradii',
    'koi_srad_err2': 'StellarRadiusLowerUnc, Solarradii',
    'ra': 'RA, decimaldegrees',
    'dec': 'Dec, decimaldegrees',
    'koi_kepmag': 'Kepler-band, mag',
})

In [None]:
# Show which labels occur in each of the two disposition columns.
for disposition_column in ('DispositionUsingKeplerData', 'ExoplanetArchiveDisposition'):
    print(exoplanet_df[disposition_column].unique())

In [None]:
import seaborn as sns

# Class balance of the Kepler-data disposition: bar chart plus the raw counts.
kepler_disposition = exoplanet_df['DispositionUsingKeplerData']
sns.countplot(x=kepler_disposition)
print(kepler_disposition.value_counts())

In [None]:
# Class balance of the archive disposition: bar chart plus the raw counts.
archive_disposition = exoplanet_df['ExoplanetArchiveDisposition']
sns.countplot(x=archive_disposition)
print(archive_disposition.value_counts())

In [None]:
# Percentage of missing values per column, highest first, as a one-column DataFrame.
from matplotlib import pyplot as plt

missing_values = exoplanet_df.isnull().sum().sort_values(ascending=False)
percentage_missing_values = ((missing_values / len(exoplanet_df)) * 100).round(2)
percentage_missing_values = percentage_missing_values.to_frame()
percentage_missing_values.columns = ['Percentage of missing values']
# A bare expression is only rendered when it is the last statement of a cell,
# so the table has to be printed explicitly to be visible here.
print(percentage_missing_values)

# Bar chart of the five columns with the most missing values.
top_missing = percentage_missing_values.head(5)
sns.barplot(x=top_missing.index, y='Percentage of missing values', data=top_missing)
plt.xticks(rotation=90)  # the column names are long; rotate them so they stay readable
plt.show()


In [None]:
# Count potential outliers in every numerical column with the 1.5 * IQR rule.
numerical_columns = exoplanet_df.select_dtypes(include=['int64', 'float64']).columns
numerical_columns = numerical_columns.drop(['KepID'])  # 'KepID' is left out of the check

for column in numerical_columns:
    values = exoplanet_df[column]
    q1, q3 = values.quantile(0.25), values.quantile(0.75)
    iqr = q3 - q1
    lower_limit = q1 - 1.5 * iqr
    upper_limit = q3 + 1.5 * iqr
    # A value is flagged when it lies below Q1 - 1.5*IQR or above Q3 + 1.5*IQR.
    is_outlier = (values < lower_limit) | (values > upper_limit)
    print(column, ":", values[is_outlier].count())

In [None]:
# Columns removed before modelling:
#   - the two equilibrium-temperature uncertainty columns (100% missing values)
#   - names, ids and similar columns that are irrelevant for prediction
empty_columns = ['EquilibriumTemperatureLowerUnc, K', 'EquilibriumTemperatureUpperUnc, K']
irrelevant_columns = ['KepID', 'KOIName', 'KeplerName', 'TCEPlanetNumber', 'TCEDeliver']
exoplanet_df = exoplanet_df.drop(columns=empty_columns + irrelevant_columns)

# Discard every row that still contains a NaN.
exoplanet_df = exoplanet_df.dropna(axis=0, how='any')

# Disabled alternative: replace IQR outliers with the column median.
# numerical_columns = exoplanet_df.select_dtypes(include=['int64', 'float64']).columns
# for column in numerical_columns:
#     q1 = exoplanet_df[column].quantile(0.25)
#     q3 = exoplanet_df[column].quantile(0.75)
#     iqr = q3-q1
#     lower_limit = q1-1.5*iqr
#     upper_limit = q3+1.5*iqr
#     exoplanet_df[column] = exoplanet_df[column].mask((exoplanet_df[column]<lower_limit) | (exoplanet_df[column]>upper_limit), exoplanet_df[column].median())


### *Our choice was to keep the outliers, since they may be genuine measurements or anomalies that prove useful for prediction.*

# 2. Feature engineering
- Removing columns with 100% missing values
- Removing irrelevant columns

In [None]:
# Encode both disposition columns as integers in new status columns:
#   'FALSE POSITIVE' -> 0, 'CANDIDATE' -> 1, anything else -> 2
# (the fallback is intended for 'CONFIRMED', but every other value lands there too).
disposition_codes = {'FALSE POSITIVE': 0, 'CANDIDATE': 1}
for source_column, status_column in (
    ('DispositionUsingKeplerData', 'KeplerDispositionStatus'),
    ('ExoplanetArchiveDisposition', 'ArchiveDispositionStatus'),
):
    exoplanet_df[status_column] = exoplanet_df[source_column].apply(
        lambda label: disposition_codes.get(label, 2)
    )

# The original text columns are dropped once they have been encoded.
exoplanet_df = exoplanet_df.drop(columns=['DispositionUsingKeplerData', 'ExoplanetArchiveDisposition'])

In [None]:
# Correlation of every column with the target 'KeplerDispositionStatus',
# sorted from the largest to the smallest value.
target_series = exoplanet_df['KeplerDispositionStatus']
correlation_with_target = exoplanet_df.corrwith(target_series).sort_values(ascending=False)
print(correlation_with_target)

# Remember the columns whose absolute correlation is below x for later dropping.
x = 0.2
columns_to_drop = [
    column_name
    for column_name, correlation in correlation_with_target.items()
    if abs(correlation) < x
]

In [None]:
#drop columns with correlation < x
# exoplanet_df = exoplanet_df.drop(columns_to_drop, axis=1)

In [None]:
# 'KeplerDispositionStatus' is the prediction target. The feature frame excludes it,
# together with 'DispositionScore' and 'ArchiveDispositionStatus'.
non_feature_columns = ['DispositionScore', 'KeplerDispositionStatus', 'ArchiveDispositionStatus']
target_column = exoplanet_df['KeplerDispositionStatus']
model_df = exoplanet_df.drop(columns=non_feature_columns)

In [None]:
# Pairwise correlations between the numerical features, rounded to two decimals.
numerical_columns = model_df.select_dtypes(include=['int64', 'float64']).columns
correlation_matrix = model_df[numerical_columns].corr().round(2)

# Walk the lower triangle of the matrix: whenever a pair is correlated above x
# (in absolute value), print it and mark the later of the two columns for dropping.
# A column can be appended more than once if it correlates with several others.
x = 0.75
columns_to_drop = []
feature_names = correlation_matrix.columns
correlation_values = correlation_matrix.to_numpy()
for i, later_feature in enumerate(feature_names):
    for j in range(i):
        pair_correlation = correlation_values[i, j]
        if abs(pair_correlation) > x:
            print(later_feature, feature_names[j], pair_correlation)
            columns_to_drop.append(later_feature)

In [None]:
# Remove the features that were flagged as correlated above x with another feature.
model_df = model_df.drop(columns=columns_to_drop)

In [None]:
# Minimum, maximum and skewness of every numerical feature, shown as one table.
from scipy.stats import skew
numerical_columns = model_df.select_dtypes(include=['int64', 'float64']).columns
min_values = model_df[numerical_columns].min()
max_values = model_df[numerical_columns].max()

# Skewness per column, rounded to two decimals, as a one-column frame.
skewness = model_df[numerical_columns].skew().round(2).to_frame(name='Skewness')

# Value range per column.
val_range = min_values.to_frame(name='Min')
val_range['Max'] = max_values

# The cell used to end in an assignment, so neither table was ever shown;
# print them side by side (both are indexed by column name).
print(val_range.join(skewness))

In [None]:
#transformation
#...

In [None]:
# Scaling
# Option 1 (disabled): scale the data by dividing by the L2 norm.
# from sklearn.preprocessing import Normalizer
# normalizer = Normalizer()
# model_df = normalizer.fit_transform(model_df)

# Option 2 (active): standardise every feature with StandardScaler
# (with default settings: remove the mean, then divide by the standard deviation).
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(model_df)
# NOTE(review): the scaler is fitted on the complete dataset, i.e. before the
# train/test split, so test rows influence the scaling statistics - consider
# fitting on the training part only.
# After this line model_df holds the transformer's output instead of the original frame.
model_df = scaler.transform(model_df)

# 3 + 4 Splitting the data and training the model
- Splitting into test and train data
- fitting & tuning KNN model
- Using validation and cross-validation to determine the best number-of-neighbors hyperparameter for KNN

In [None]:
# Split the data into train, validation and test sets.
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows as the test set, preserving the class proportions of the target.
X_trainVal, X_test, y_trainVal, y_test = train_test_split(
    model_df, target_column, stratify=target_column, test_size=0.25, random_state=69)

# Split the remaining rows 75/25 into train and validation. This split is stratified
# as well (it was not before), so the validation set has the same class balance as
# the train and test sets and validation scores are comparable with test scores.
X_train, X_val, y_train, y_val = train_test_split(
    X_trainVal, y_trainVal, stratify=y_trainVal, test_size=0.25, random_state=69)

In [None]:
# Choose n_neighbors (1..14) by scoring each candidate on the validation set.
best_score = 0
for num_neighbors in range(1, 15):
    # Fit on the training part only, then score on the validation part.
    knn = KNeighborsClassifier(n_neighbors=num_neighbors).fit(X_train, y_train)
    score = knn.score(X_val, y_val)

    # Only a strict improvement replaces the current best, so ties keep the smaller k.
    if score > best_score:
        best_score, best_num_neighbors = score, num_neighbors

# Refit the winning setting on the combined training and validation data.
knn = KNeighborsClassifier(n_neighbors=best_num_neighbors)
knn.fit(X_trainVal, y_trainVal)

for message, value in (
    ("Best number of neighbors found: {}", best_num_neighbors),
    ("Best score on validation set: {}", best_score),
    ("Score on training/validation set: {}", knn.score(X_trainVal, y_trainVal)),
    ("Score on test set: {}", knn.score(X_test, y_test)),
):
    print(message.format(value))

In [None]:
# Choose n_neighbors (1..14) by 5-fold cross-validation on the combined
# training/validation data.
best_num_neighbors = 0
best_score = 0
for num_neighbors in range(1, 15):
    knn = KNeighborsClassifier(n_neighbors=num_neighbors)

    # One score per fold; the candidate is judged by their mean.
    scores = cross_val_score(knn, X_trainVal, y_trainVal, cv=5)
    score = scores.mean()
    print("Number of neighbors: {}, score: {}".format(num_neighbors, score))

    # Only a strict improvement replaces the current best, so ties keep the smaller k.
    if score > best_score:
        best_score, best_num_neighbors = score, num_neighbors

# Refit the winning setting on the combined training and validation data.
knn = KNeighborsClassifier(n_neighbors=best_num_neighbors)
knn.fit(X_trainVal, y_trainVal)

for message, value in (
    ("Best number of neighbors found: {}", best_num_neighbors),
    ("Best average score: {}", best_score),
    ("Score on training/validation set: {}", knn.score(X_trainVal, y_trainVal)),
    ("Score on test set: {}", knn.score(X_test, y_test)),
):
    print(message.format(value))

# 5. Writing functions for ROC Curve, Precision-Recall Curve and one for the Confusion Matrix

In [None]:
from sklearn.metrics import classification_report

# Predict the test set with the current knn model and print the classification
# report (accuracy, precision, recall and f1-score).
y_pred = knn.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)

In [None]:
from sklearn.metrics import confusion_matrix

# Compute the confusion matrix of the test predictions once. It used to be computed
# twice, and the first result was a bare mid-cell expression that was never shown.
test_confusion = confusion_matrix(y_test, y_pred)
print(test_confusion)

# Same matrix as an annotated heatmap (rows: true class, columns: predicted class).
heatmap = sns.heatmap(test_confusion, annot=True, fmt='d')
heatmap.set_xlabel('Predicted')
heatmap.set_ylabel('True')
heatmap.set_title('Confusion Matrix')

In [None]:
# Display the precision-recall curve of the current knn model on the test set.
import matplotlib.pyplot as plt
from sklearn.metrics import precision_recall_curve

# The curve needs a continuous score per sample. Hard class predictions (y_pred)
# collapse it to a single operating point, so use the predicted probability of
# the positive class (label 1) instead.
positive_index = list(knn.classes_).index(1)
y_score = knn.predict_proba(X_test)[:, positive_index]

precision, recall, thresholds = precision_recall_curve(y_test, y_score)

# Conventional orientation: recall on the x-axis, precision on the y-axis.
plt.plot(recall, precision)
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve')
plt.show()

In [None]:
# Display the ROC curve of the current knn model on the test set.
from sklearn.metrics import roc_curve

# The curve needs a continuous score per sample. Hard class predictions (y_pred)
# yield only one threshold and therefore a three-point "curve", so use the
# predicted probability of the positive class (label 1) instead.
positive_index = list(knn.classes_).index(1)
y_score = knn.predict_proba(X_test)[:, positive_index]

fpr, tpr, thresholds = roc_curve(y_test, y_score)
plt.plot(fpr, tpr)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.show()


In [None]:
# Accuracy on the training and test sets as a function of n_neighbors (1..24).
train_accuracy = []
test_accuracy = []
neighoursRange = range(1, 25)
for i in neighoursRange:
    # A separate name is used for the loop's models so that the tuned `knn`
    # fitted earlier is not overwritten by the last candidate of this sweep.
    candidate_knn = KNeighborsClassifier(n_neighbors=i).fit(X_train, y_train)
    train_accuracy.append(candidate_knn.score(X_train, y_train))
    test_accuracy.append(candidate_knn.score(X_test, y_test))

plt.plot(neighoursRange, train_accuracy, label='train')
plt.plot(neighoursRange, test_accuracy, label='test')
plt.xlabel('Number of neighbors')
plt.ylabel('Accuracy')
plt.legend()  # without this call the line labels above are never displayed
plt.show()

# Trying other models

In [None]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes, fitted on the training split and scored on train and test.
gnb = GaussianNB().fit(X_train, y_train)
train_score = gnb.score(X_train, y_train)
test_score = gnb.score(X_test, y_test)
print("Accuracy on training set: {}".format(train_score))
print("Accuracy on test set: {}".format(test_score))

In [None]:
# Linear regression fitted on the integer-encoded target.
from sklearn.linear_model import LinearRegression
linreg = LinearRegression().fit(X_train, y_train)

# LinearRegression.score returns the coefficient of determination (R^2), not a
# classification accuracy, so the output is labelled accordingly. These numbers are
# not directly comparable with the accuracies reported for the classifiers.
print("R^2 on training set: {}".format(linreg.score(X_train, y_train)))
print("R^2 on test set: {}".format(linreg.score(X_test, y_test)))