In [1]:
# Import the modules
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler

---

In [2]:
# Download the TOI table as CSV from the ExoFOP-TESS endpoint and load it
# into a Pandas DataFrame (this is a network read, not a local file)
lending_df = pd.read_csv(
    filepath_or_buffer="https://exofop.ipac.caltech.edu/tess/download_toi.php?sort=toi&output=csv"
)

# Preview the first rows of the DataFrame
lending_df.head()

Unnamed: 0,TIC ID,TOI,Previous CTOI,Master,SG1A,SG1B,SG2,SG3,SG4,SG5,...,Stellar Radius (R_Sun) err,Stellar Metallicity,Stellar Metallicity err,Stellar Mass (M_Sun),Stellar Mass (M_Sun) err,Sectors,Date TOI Alerted (UTC),Date TOI Updated (UTC),Date Modified,Comments
0,231663901,101.01,,5,5,5,5,5,5,5,...,0.043847,,,1.05,0.129454,127,2018-09-05,2021-10-07,2022-12-14 12:09:24,WASP-46 b
1,149603524,102.01,,5,5,5,5,5,5,5,...,0.05,0.24,0.05,1.28,0.190812,"1,2,3,4,6,7,8,9,10,11,12,13,27,28,29,30,31,32,...",2019-05-07,2023-04-04,2023-04-07 12:13:06,WASP 62 b
2,336732616,103.01,,5,5,5,5,5,5,5,...,,,,1.27,0.196969,1,2018-09-05,2020-10-27,2022-12-14 12:09:24,HATS-3 b
3,231670397,104.01,,5,5,5,5,5,5,5,...,0.102573,,,1.16,0.166129,127,2018-09-05,2021-12-01,2022-12-14 12:09:24,WASP-73 b
4,144065872,105.01,,5,5,5,5,5,5,5,...,0.059699,,,1.03,0.127209,128,2018-09-05,2021-12-08,2022-12-14 12:09:24,WASP-95; epoch kept from qlp-s28-tois


In [3]:
# Drop every column that is not used for modelling (identifiers, observation
# bookkeeping, the "... err" uncertainty columns, dates and free-text comments),
# leaving the "TFOPWG Disposition" label and the numeric feature columns.
#
# `columns=` is passed by keyword: the positional axis form `drop(labels, 1)`
# was deprecated in pandas 1.3 and raises a TypeError from pandas 2.0 onwards.
TessOI_pre_df = lending_df.drop(
    columns=[
        'TIC ID', 'TOI', 'Previous CTOI', 'Master', 'SG1A', 'SG1B', 'SG2',
        'SG3', 'SG4', 'SG5', 'ESM', 'TSM', 'Predicted Mass (M_Earth)', 'RA', 'Dec',
        'Time Series Observations', 'Spectroscopy Observations',
        'Imaging Observations', 'TESS Disposition',
        'TESS Mag err', 'Planet Name', 'Pipeline Signal ID',
        'Source', 'Detection', 'PM RA (mas/yr)', 'PM Dec (mas/yr)',
        'PM RA err (mas/yr)', 'PM Dec err (mas/yr)', 'Epoch (BJD) err', 'Period (days) err',
        'Duration (hours) err', 'Depth (mmag)',
        'Depth (mmag) err', 'Depth (ppm) err',
        'Planet Radius (R_Earth) err',
        'Planet SNR',
        'Stellar Distance (pc) err',
        'Stellar Eff Temp (K) err',
        'Stellar log(g) (cm/s^2) err',
        'Stellar Radius (R_Sun) err',
        'Stellar Metallicity', 'Stellar Metallicity err',
        'Stellar Mass (M_Sun)', 'Stellar Mass (M_Sun) err', 'Sectors',
        'Date TOI Alerted (UTC)', 'Date TOI Updated (UTC)', 'Date Modified',
        'Comments',
    ]
)
TessOI_pre_df.head()

Unnamed: 0,TFOPWG Disposition,TESS Mag,Epoch (BJD),Period (days),Duration (hours),Depth (ppm),Planet Radius (R_Earth),Planet Insolation (Earth Flux),Planet Equil Temp (K),Stellar Distance (pc),Stellar Eff Temp (K),Stellar log(g) (cm/s^2),Stellar Radius (R_Sun)
0,KP,12.4069,2459037.0,1.430369,1.643873,19151.216214,13.250493,1281.241792,1525.905097,375.31,5600.0,4.48851,0.890774
1,KP,9.7109,2460011.0,4.411938,3.728,15219.0,15.569,782.274,1473.0,175.631,6280.0,4.32092,1.21
2,KP,11.5232,2458327.0,3.547854,3.494333,10424.3718,14.581841,1212.004376,1504.858953,411.211,6351.0,4.22896,1.4
3,KP,9.8638,2459039.0,4.087299,5.586113,3572.17125,13.623773,2242.36054,1755.076738,316.678,6036.0,3.93359,2.21867
4,KP,9.4995,2459085.0,2.184667,2.86528,11708.022855,13.702853,1362.138143,1549.44098,137.544,5630.0,4.37759,1.23824


In [4]:
# Remove the rows whose disposition is "APC" or "PC"; only the four
# dispositions that are binned below are kept for the binary target
excluded_rows = TessOI_pre_df["TFOPWG Disposition"].isin(["APC", "PC"])
TessOI_TnF_df = TessOI_pre_df.drop(index=TessOI_pre_df.index[excluded_rows])

# Bin the remaining dispositions into a binary label:
#   "FP", "FA" -> 0 (false types)      "KP", "CP" -> 1 (true types)
# A single .map() with one lookup table replaces four full-column .replace()
# passes and always yields a numeric column. The previous approach depended on
# .replace() silently downcasting an object column, which pandas 2.2 deprecates.
# Missing dispositions stay NaN, and any disposition not listed here also
# becomes NaN instead of remaining a string.
disposition_to_label = {"FP": 0, "FA": 0, "KP": 1, "CP": 1}
TessOI_TnF_df["TFOPWG Disposition"] = TessOI_TnF_df["TFOPWG Disposition"].map(disposition_to_label)

# Check to make sure binning was successful
TessOI_TnF_df["TFOPWG Disposition"].value_counts()

0.0    1003
1.0     855
Name: TFOPWG Disposition, dtype: int64

In [5]:
# Discard every row that has at least one missing value
TessOI_TnF_df = TessOI_TnF_df.dropna(axis="index", how="any")

# Non-null count per column; all counts match once incomplete rows are gone
TessOI_TnF_df.count()

TFOPWG Disposition                1604
TESS Mag                          1604
Epoch (BJD)                       1604
Period (days)                     1604
Duration (hours)                  1604
Depth (ppm)                       1604
Planet Radius (R_Earth)           1604
Planet Insolation (Earth Flux)    1604
Planet Equil Temp (K)             1604
Stellar Distance (pc)             1604
Stellar Eff Temp (K)              1604
Stellar log(g) (cm/s^2)           1604
Stellar Radius (R_Sun)            1604
dtype: int64

In [6]:
# Separate the data into labels (y) and features (X) as NumPy arrays.
# The label column is removed with `columns=`: the positional axis form
# `drop(labels, 1)` raises a TypeError from pandas 2.0 onwards.
y = TessOI_TnF_df["TFOPWG Disposition"].values
X = TessOI_TnF_df.drop(columns=["TFOPWG Disposition"]).values

In [7]:
# Bring in the train_test_split helper from scikit-learn
from sklearn.model_selection import train_test_split

# Partition features and labels into training and testing subsets.
# random_state=1 fixes the shuffle so the split is reproducible.
(X_train, X_test,
 y_train, y_test) = train_test_split(X, y, random_state=1)


In [8]:
# Standardise the features; the scaler is fitted on the training split only
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Apply the fitted scaler to the training split, then to the testing split
X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

In [9]:
# LogisticRegression comes from scikit-learn's linear_model package
from sklearn.linear_model import LogisticRegression

# Build the classifier with random_state=1
logistic_regression_model = LogisticRegression(random_state=1)

# Train on the scaled training split; the fitted estimator is the cell's output
logistic_regression_model.fit(X=X_train_scaled, y=y_train)

LogisticRegression(random_state=1)

In [10]:
# Predict a class label for every row of the scaled testing split
testing_predictions = logistic_regression_model.predict(X=X_test_scaled)

In [11]:
# Plain accuracy on the testing split (fraction of correct predictions)
test_accuracy = logistic_regression_model.score(X_test_scaled, y_test)
print('Test Acc: %.3f' % test_accuracy)

# Balanced accuracy (mean of the per-class recalls). This cell previously
# printed only plain accuracy even though its comment promised the balanced
# score; balanced_accuracy_score is imported in the notebook's first cell.
test_balanced_accuracy = balanced_accuracy_score(y_test, testing_predictions)
print('Test Balanced Acc: %.3f' % test_balanced_accuracy)

Test Acc: 0.746


In [12]:
# confusion_matrix is provided by scikit-learn's metrics module
from sklearn.metrics import confusion_matrix

# Build the confusion matrix from the testing labels and the model's predictions
testing_matrix = confusion_matrix(y_true=y_test, y_pred=testing_predictions)

# Show the matrix for the testing data (rows: actual class, columns: predicted class)
print(testing_matrix)

[[120  58]
 [ 44 179]]


In [13]:
# classification_report is provided by scikit-learn's metrics module
from sklearn.metrics import classification_report

# Per-class precision, recall, f1-score and support for the testing split
test_report = classification_report(y_true=y_test, y_pred=testing_predictions)
print(test_report)

              precision    recall  f1-score   support

         0.0       0.73      0.67      0.70       178
         1.0       0.76      0.80      0.78       223

    accuracy                           0.75       401
   macro avg       0.74      0.74      0.74       401
weighted avg       0.74      0.75      0.74       401

