In [None]:
# packages?
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


In [48]:
# 1. Exploratory data analysis: load the training set and profile its missing values
banner = "=" * 50
print("\n" + banner)
print("EXPLORATORY DATA ANALYSIS")
print(banner)

# Read the training data from disk and preview the first rows
train_data = pd.read_csv('train_2025_2026.csv')
print(train_data.head())

# Boolean mask of missing cells, computed once and reused by every statistic below
is_missing = train_data.isnull()
n_rows, n_cols = train_data.shape

# Column-level view: how many columns contain at least one missing value
missing_by_column = is_missing.sum()
columns_with_missing = missing_by_column[missing_by_column > 0]
print(f"\nColumns with missing values: {len(columns_with_missing)}")
print(f"Columns without missing values: {n_cols - len(columns_with_missing)}")

# Row-level view: is the missingness concentrated in particular rows?
rows_with_any_missing = is_missing.any(axis=1).sum()
rows_with_all_missing = is_missing.all(axis=1).sum()
# A row counts as ">50% missing" when more than half of its columns are null
rows_with_half_missing = (is_missing.sum(axis=1) > n_cols // 2).sum()

for label, count in (
    ("Rows with any missing values", rows_with_any_missing),
    ("Rows with all values missing", rows_with_all_missing),
    ("Rows with >50% values missing", rows_with_half_missing),
):
    print(f"{label}: {count} ({count/n_rows*100:.1f}%)")

# Per-column missing count and percentage, worst columns first
print("\nMissing value patterns (first 10 columns with most missing):")
missing_analysis = (
    pd.DataFrame({
        'Missing_Count': missing_by_column,
        'Missing_Percentage': (missing_by_column / n_rows) * 100,
    })
    .sort_values('Missing_Count', ascending=False)
)

print(missing_analysis.head(10))

# Class balance of the target column 'Outcome': absolute counts and percentages.
# The counts are computed and printed once (the original repeated both).
print("\n" + "="*50)
print("Proportion of classes in data")

class_counts = train_data['Outcome'].value_counts()
# normalize=True yields fractions; scale to percentages for readability
class_proportions = train_data['Outcome'].value_counts(normalize=True) * 100

print("Absolute counts:")
print(class_counts)
print("\nProportions (%):")
print(class_proportions)

# # Display in a more readable format
# print("\n" + "-"*30)
# print("CLASS DISTRIBUTION SUMMARY")
# print("-"*30)
# for class_label, count in class_counts.items():
#     proportion = class_proportions[class_label]
#     print(f"Class {class_label}: {count:,} samples ({proportion:.2f}%)")

# # Calculate and display class imbalance ratio
# if len(class_counts) == 2:
#     majority_class = class_counts.idxmax()
#     minority_class = class_counts.idxmin()
#     imbalance_ratio = class_counts.max() / class_counts.min()
#     print(f"\nClass Imbalance Ratio: {imbalance_ratio:.2f}:1")
#     print(f"Majority class: {majority_class} ({class_counts.max():,} samples)")
#     print(f"Minority class: {minority_class} ({class_counts.min():,} samples)")

# # Visualize the class distribution
# plt.figure(figsize=(10, 6))


# Correlation



# Highly correlated genes/features

# Pairwise correlation between all columns of the training frame (pandas default method).
# NOTE(review): this matrix still contains the non-gene columns ('Outcome', 'Id');
# it is kept unchanged in case other cells rely on the full matrix.
corr_matrix = train_data.corr()
print(corr_matrix)

# The printed matrix is truncated by pandas and does not reveal which features are
# strongly related, so additionally list the most correlated feature pairs.
NON_FEATURE_COLUMNS = ['Outcome', 'Id']
gene_corr = corr_matrix.drop(index=NON_FEATURE_COLUMNS, columns=NON_FEATURE_COLUMNS, errors='ignore')

# Only the strict upper triangle is needed: the matrix is symmetric and the
# diagonal is each feature's correlation with itself.
pair_rows, pair_cols = np.triu_indices(len(gene_corr), k=1)
pairwise_corr = pd.DataFrame({
    'Feature_A': gene_corr.index[pair_rows],
    'Feature_B': gene_corr.columns[pair_cols],
    'Correlation': gene_corr.to_numpy()[pair_rows, pair_cols],
})

# Rank by absolute value so strong negative correlations are reported too
top_pair_index = pairwise_corr['Correlation'].abs().nlargest(10).index
top_correlated_pairs = pairwise_corr.loc[top_pair_index]
print("\nTop 10 most correlated feature pairs (by absolute correlation):")
print(top_correlated_pairs)


# Maximum and minimum values




# Feature with highest variance



# Summary report: dataset dimensions and overall missing-value figures
summary_rule = "=" * 50
summary_rows, summary_cols = train_data.shape
n_columns_with_missing = len(columns_with_missing)

summary_lines = [
    "\n" + summary_rule,
    "EXPLORATORY DATA ANALYSIS SUMMARY",
    summary_rule,
    f"Dataset: {summary_rows} rows, {summary_cols} columns",
    # Sum over columns, then over the resulting per-column counts
    f"Total missing values: {train_data.isnull().sum().sum()}",
    f"\nColumns with missing values: {n_columns_with_missing}",
    f"Columns without missing values: {len(train_data.columns) - n_columns_with_missing}",
]
print("\n".join(summary_lines))




EXPLORATORY DATA ANALYSIS
   Gene 1  Gene 2  Gene 3   Gene 4  Gene 5  Gene 6  Gene 7  Gene 8  Gene 9  \
0     NaN   133.0    75.0  12971.0   401.0   296.0   348.0  1087.0  4224.0   
1     5.0   127.0    85.0  15875.0   285.0   305.0   298.0  1297.0  2090.0   
2     5.0    81.0   133.0      NaN   228.0   387.0   465.0  2657.0  3653.0   
3     5.0    75.0     NaN  17527.0   467.0   242.0   392.0  3482.0  3408.0   
4     4.0    74.0   103.0  13053.0   329.0     NaN   357.0  2714.0  2739.0   

   Gene 10  ...  Gene 1993  Gene 1994  Gene 1995  Gene 1996  Gene 1997  \
0      NaN  ...      101.0    16517.0      308.0      100.0      235.0   
1      8.0  ...       64.0    17950.0      458.0      191.0      137.0   
2     32.0  ...      138.0    11715.0      181.0      161.0      261.0   
3     59.0  ...       88.0    12420.0      403.0      114.0      290.0   
4     75.0  ...      129.0    11697.0      205.0      102.0      269.0   

   Gene 1998  Gene 1999  Gene 2000  Outcome    Id  
0      

In [None]:
# Answer to guided exercise 2:


from sklearn.utils import resample


# Balance the classes of 'Outcome' by random undersampling: every class is cut
# down to the size of the smallest class, then the result is saved to disk.
imbalanced_data = pd.read_csv('train_2025_2026.csv')
#print(imbalanced_data.describe())

# Check the classes and their counts
class_counts = imbalanced_data['Outcome'].value_counts()
print(class_counts)
print('--------')
# Size of the smallest class; reuse the counts computed above instead of recounting
num_min = class_counts.min()
print('Number of samples of the class with less samples:', num_min)
print('--------')
temp_list = []

# Iterate over all classes and draw `num_min` rows from each one without
# replacement (undersampling). The fixed random_state makes the draw reproducible.
for class_ in imbalanced_data['Outcome'].unique():
    df_class = imbalanced_data[imbalanced_data['Outcome'] == class_]
    df_class_under = resample(df_class, replace=False, n_samples=num_min, random_state=42)
    temp_list.append(df_class_under)

# Rebuild the balanced frame; the original row labels are kept as the index
balanced_data = pd.concat(temp_list)
print('New class sample distribution:', '\n', balanced_data['Outcome'].value_counts())
print('--------')
print('Head:', '\n', balanced_data.head())
print('--------')
print('Tail:', '\n',balanced_data.tail())
# Save the balanced data. The model-training cell reads this file, so it must be
# written for the notebook to run from a fresh kernel. The index is written as
# the first column, matching the `index_col=0` used when the file is read back.
balanced_data.to_csv('training_balanced_data.csv')



Outcome
3    1280
2     881
1     730
0     629
Name: count, dtype: int64
--------
Number of samples of the class with less samples: 629
--------
New class sample distribution: 
 Outcome
3    629
0    629
2    629
1    629
Name: count, dtype: int64
--------
Head: 
       Gene 1  Gene 2  Gene 3   Gene 4  Gene 5  Gene 6  Gene 7  Gene 8  Gene 9  \
3415     4.0   103.0    54.0  12207.0   364.0   421.0   280.0  2803.0  2253.0   
1969     4.0   123.0   121.0  15007.0   228.0   311.0   433.0  2718.0     NaN   
2523     3.0   145.0    74.0  14686.0   418.0   433.0   386.0  1767.0  3057.0   
2173     6.0   106.0    94.0  16621.0   385.0   303.0   386.0  4111.0  2721.0   
2087     4.0    87.0     NaN  17093.0   407.0   306.0   326.0  4048.0  1892.0   

      Gene 10  ...  Gene 1993  Gene 1994  Gene 1995  Gene 1996  Gene 1997  \
3415     54.0  ...      122.0    18502.0      407.0      132.0      297.0   
1969     51.0  ...      127.0    12704.0      475.0      143.0      297.0   
2523     51.0  .

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
import pandas as pd


# Baseline classifier on the balanced data: impute -> standardize -> logistic regression.
# The first CSV column holds the saved row index, hence index_col=0.
data = pd.read_csv('training_balanced_data.csv', index_col=0)
data_y = data['Outcome']
# 'Id' is an identifier and 'Outcome' is the target, so neither is used as a feature
data_X = data.drop(columns = ['Outcome', 'Id'])


# Hold out 20% of the rows for evaluation; fixed seed for a reproducible split
X_train, X_test, y_train, y_test = train_test_split(data_X, data_y, test_size=0.2,random_state=42)
#print(y_train.head())

# Imputer and scaler live inside the pipeline, so they are fitted on the
# training split only and merely applied to the test split.
# NOTE(review): LogisticRegression uses its default settings here; if a
# ConvergenceWarning appears, consider raising max_iter.
model = make_pipeline(SimpleImputer(), StandardScaler(), LogisticRegression())

model.fit(X_train, y_train)
# Mean accuracy on the held-out split
score = model.score(X_test, y_test)
# Report the result; previously the score was computed but never shown
print(f"Test accuracy: {score:.3f}")

