In [40]:
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.svm import SVC, LinearSVC
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
import sklearn.metrics as metrics
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt
from sklearn.feature_selection import SelectKBest, chi2, RFE
from sklearn.multiclass import OneVsRestClassifier
from sklearn.utils import resample

In [62]:
# Read in csv file
df = pd.read_csv("/content/drive/MyDrive/HackathonData/train_combined_Species.csv", sep = ',')

In [63]:
# Drop the index column
df.drop(df.columns[0], axis=1, inplace=True)  

In [43]:
df.shape

(1006, 17788)

# DATA PREPROCESSING

In [64]:
# create a MinMaxScaler object
scaler = MinMaxScaler()

# select the columns to be scaled and create a new dataframe
df_temp = df.drop("label", axis=1)

# scale the selected columns
df_scaled = scaler.fit_transform(df_temp.values)

# create a new dataframe from the scaled columns
df_scaled = pd.DataFrame(df_scaled, columns=df_temp.columns)

# concatenate the scaled columns with the unscaled 'label' column
df_scaled_result = pd.concat([df_scaled, df['label']], axis=1)

df = df_scaled_result

# FEATURE SELECTION





In [65]:
# Selecting only the columns which contain only 2 unique values
n_unique = df.loc[:,df.apply(pd.Series.nunique) == 2]

if 'label' in n_unique.columns:

    # Select only the rows with 'label' value 0
    nicht_kranke = n_unique[n_unique['label'] == 0]

    for i in nicht_kranke.columns:
        if i != 'label':
            display(nicht_kranke[i].value_counts())

    # Select only the rows with 'label' value 1
    kranke = n_unique[n_unique['label'] == 1]
    
    for i in kranke.columns:
        # If the column is not 'label', display the value counts
        if i != 'label':
            print(kranke[i].value_counts())

    # Drop the 'label' column from n_unique
    n_unique.drop('label', axis=1, inplace=True)

# Create a list of columns from df which are not present in n_unique
cols = [col for col in df.columns if col not in n_unique.columns]

# If cols is not an empty list, select only those columns from df
if cols:
    df = df[cols]
else:
    # Create a copy of df and assign it to df
    df = df.copy()


df.shape


(1006, 11715)

# PCA, SVM

In [69]:
# Extract the features and target variables
X = df.iloc[:, :-1].values
y = df.iloc[:, -1].values

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Initialize a list to store the cross-validated accuracy for different PCA components
cv_scores = []

# Try a smaller range of PCA components
for n_components in range(1, min(X.shape[1]+1, 20)):
    pca = PCA(n_components=n_components)
    X_pca = pca.fit_transform(X)
    
    # Define the parameter grid for grid search
    param_grid = {'C': [0.1, 1, 10, 100], 'gamma': [1, 0.1, 0.01, 0.001]}
    
    svm = SVC()
    grid = GridSearchCV(svm, param_grid, cv=5)
    grid.fit(X_pca, y)
    cv_score = grid.best_score_
    cv_scores.append(cv_score)

# Write the results for different PCA components into an array
cv_scores = np.array(cv_scores)


In [70]:
print(cv_scores)

[0.45824344 0.47712428 0.47515393 0.49601005 0.5049702  0.50992562
 0.48807448 0.48309935 0.48111423 0.4870696  0.4850894  0.4830895
 0.48311906 0.47913896 0.48705482 0.48014384 0.48011428 0.4781193
 0.47912911]


# EVALUATE

In [51]:
# Print classification report
#print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.45      0.68      0.54        78
           1       0.54      0.43      0.48       104
           2       0.00      0.00      0.00        13
           3       0.00      0.00      0.00         5
           4       0.00      0.00      0.00         2

    accuracy                           0.49       202
   macro avg       0.20      0.22      0.20       202
weighted avg       0.45      0.49      0.46       202



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
