In [100]:
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

from sklearn.ensemble import AdaBoostClassifier

from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score

In [82]:
# Read the training file and keep only the rows that have no missing values.
data = pd.read_csv('train.csv').dropna()
data

Unnamed: 0,Id,AB,AF,AH,AM,AR,AX,AY,AZ,BC,...,FL,FR,FS,GB,GE,GF,GH,GI,GL,Class
0,000ff2bfdfe9,0.209377,3109.03329,85.200147,22.394407,8.138688,0.699861,0.025578,9.812214,5.555634,...,7.298162,1.73855,0.094822,11.339138,72.611063,2003.810319,22.136229,69.834944,0.120343,1
1,007255e47698,0.145282,978.76416,85.200147,36.968889,8.138688,3.632190,0.025578,13.517790,1.229900,...,0.173229,0.49706,0.568932,9.292698,72.611063,27981.562750,29.135430,32.131996,21.978000,0
2,013f2bd269f5,0.470030,2635.10654,85.200147,32.360553,8.138688,6.732840,0.025578,12.824570,1.229900,...,7.709560,0.97556,1.198821,37.077772,88.609437,13676.957810,28.022851,35.192676,0.196941,0
3,043ac50845d5,0.252107,3819.65177,120.201618,77.112203,8.138688,3.685344,0.025578,11.053708,1.229900,...,6.122162,0.49706,0.284466,18.529584,82.416803,2094.262452,39.948656,90.493248,0.155829,0
4,044fb8a146ec,0.380297,3733.04844,85.200147,14.103738,8.138688,3.942255,0.054810,3.396778,102.151980,...,8.153058,48.50134,0.121914,16.408728,146.109943,8524.370502,45.381316,36.262628,0.096614,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
611,fd1dd68d51b4,0.175193,2607.26686,85.200147,7.067354,8.138688,4.030845,0.025578,3.396778,1.229900,...,0.173229,1.89486,1.395238,16.911036,246.093155,10960.364830,38.380254,41.007968,21.978000,0
612,fd3dafe738fd,0.149555,3130.05946,123.763599,9.513984,13.020852,3.499305,0.077343,8.545512,2.804172,...,0.173229,1.26092,0.067730,8.967128,217.148554,8095.932828,24.640462,69.191944,21.978000,0
613,fd895603f071,0.435846,5462.03438,85.200147,46.551007,15.973224,5.979825,0.025882,12.622906,3.777550,...,10.223150,1.24236,0.426699,35.896418,496.994214,3085.308063,29.648928,124.808872,0.145340,0
614,fd8ef6377f76,0.427300,2459.10720,130.138587,55.355778,10.005552,8.070549,0.025578,15.408390,1.229900,...,0.173229,0.49706,0.067730,19.962092,128.896894,6474.652866,26.166072,119.559420,21.978000,0


In [83]:
# Count how many rows carry each distinct value of the 'EJ' column.
data['EJ'].value_counts()

B    329
A    219
Name: EJ, dtype: int64

In [84]:
# Count how many rows carry each distinct value of the 'Class' target column.
data['Class'].value_counts()

0    446
1    102
Name: Class, dtype: int64

In [86]:
# Target is the 'Class' column; predictors are every column except 'Id' and 'Class'.
y = data['Class']
X = data.drop(columns=['Id', 'Class'])

# Hold out 20% of the rows for validation; the seed keeps the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=100,
)

# Columns named in categorical_cols are one-hot encoded; all other predictor
# columns are standardized.
categorical_cols = ['EJ']
numerical_cols = [name for name in X.columns if name not in categorical_cols]

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        ('cat', OneHotEncoder(), categorical_cols),
    ],
    remainder='passthrough',
)

In [87]:
# Fit the preprocessing on the training split only, then apply the fitted
# transform to the validation split.
X_train_encoded = preprocessor.fit_transform(X_train)
X_val_encoded = preprocessor.transform(X_val)

# Logistic regression with default settings, evaluated on the validation split.
lr_model = LogisticRegression().fit(X_train_encoded, y_train)
y_pred_lr = lr_model.predict(X_val_encoded)

accuracy_lr = accuracy_score(y_val, y_pred_lr)
print("Logistic Regression Accuracy:", accuracy_lr)

Logistic Regression Accuracy: 0.9181818181818182


In [98]:
# Validation-set diagnostics for the logistic regression predictions.
conf_matrix_lr = confusion_matrix(y_val, y_pred_lr)
precision_lr, recall_lr, f1_score_lr = (
    metric(y_val, y_pred_lr)
    for metric in (precision_score, recall_score, f1_score)
)

print("Logistic Regression Confusion Matrix:")
print(conf_matrix_lr)
print("Precision:", precision_lr)
print("Recall:", recall_lr)
print("F1-score:", f1_score_lr)

Logistic Regression Confusion Matrix:
[[90  4]
 [ 5 11]]
Precision: 0.7333333333333333
Recall: 0.6875
F1-score: 0.7096774193548386


In [133]:
# Boost logistic-regression learners with AdaBoost and score the validation split.
# The keyword is `estimator=` rather than `base_estimator=`: the old name was
# deprecated in scikit-learn 1.2 and removed in 1.4, and the final-training cell
# of this notebook already passes `estimator=`.
adaboost_model = AdaBoostClassifier(estimator=lr_model, n_estimators=29, random_state=10)
adaboost_model.fit(X_train_encoded, y_train)

y_pred_adaboost = adaboost_model.predict(X_val_encoded)

accuracy_adaboost = accuracy_score(y_val, y_pred_adaboost)
print("AdaBoost Accuracy with Logistic Regression base estimator:", accuracy_adaboost)

AdaBoost Accuracy with Logistic Regression base estimator: 0.9363636363636364




In [134]:
# Validation-set diagnostics for the AdaBoost predictions.
# Results are stored under *_adaboost names so that the logistic-regression
# metrics held in conf_matrix_lr / precision_lr / recall_lr / f1_score_lr are
# not overwritten when this cell runs.
conf_matrix_adaboost = confusion_matrix(y_val, y_pred_adaboost)

precision_adaboost = precision_score(y_val, y_pred_adaboost)
recall_adaboost = recall_score(y_val, y_pred_adaboost)
f1_score_adaboost = f1_score(y_val, y_pred_adaboost)

print("AdaBoost Accuracy with Logistic Regression Confusion Matrix:")
print(conf_matrix_adaboost)
print("Precision:", precision_adaboost)
print("Recall:", recall_adaboost)
print("F1-score:", f1_score_adaboost)

AdaBoost Accuracy with Logistic Regression Confusion Matrix:
[[91  3]
 [ 4 12]]
Precision: 0.8
Recall: 0.75
F1-score: 0.7741935483870969


In [135]:
# Read the test file into a DataFrame for inspection.
test = pd.read_csv('test.csv')

In [136]:
# Display the test DataFrame.
test

Unnamed: 0,Id,AB,AF,AH,AM,AR,AX,AY,AZ,BC,...,FI,FL,FR,FS,GB,GE,GF,GH,GI,GL
0,00eed32682bb,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,010ebe33f668,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,02fa521e1838,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,040e15f562a2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,046e85c7cc7f,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [152]:
# Final model: refit on every complete training row, then score test.csv.
# NOTE(review): this cell rebinds X_train, y_train, categorical_cols,
# numerical_cols, preprocessor, X_train_encoded, lr_model and adaboost_model,
# which earlier cells also define; re-running those earlier cells afterwards
# would pick up the values assigned here.

# Reload the training file and discard rows with missing values.
train_data = pd.read_csv('train.csv').dropna()

y_train = train_data['Class']
X_train = train_data.drop(columns=['Id', 'Class'])

# Columns named in categorical_cols are one-hot encoded; all other predictor
# columns are standardized.
categorical_cols = ['EJ']
numerical_cols = [name for name in X_train.columns if name not in categorical_cols]

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        # drop='first' removes the first category so it serves as the reference.
        ('cat', OneHotEncoder(drop='first'), categorical_cols),
    ],
    remainder='passthrough',
)
X_train_encoded = preprocessor.fit_transform(X_train)

# AdaBoost over logistic-regression learners: 50 estimators, fixed seed.
lr_model = LogisticRegression()
adaboost_model = AdaBoostClassifier(
    estimator=lr_model,
    n_estimators=50,
    random_state=42,
).fit(X_train_encoded, y_train)

# Transform the test rows with the preprocessing fitted on the training data.
test_data = pd.read_csv('test.csv')
test_ids = test_data['Id']
test_features = test_data.drop(columns='Id')
test_features_encoded = preprocessor.transform(test_features)

# One probability column per class, in the column order returned by predict_proba.
probabilities = adaboost_model.predict_proba(test_features_encoded)
probabilities_class_0, probabilities_class_1 = probabilities[:, 0], probabilities[:, 1]

# NOTE(review): the identifier is read from the 'Id' column but stored as "Ids";
# confirm which column name the consumer of the submission file expects.
submission_df = pd.DataFrame(
    {
        "Ids": test_ids,
        "class_0": probabilities_class_0,
        "class_1": probabilities_class_1,
    }
)

: 

In [150]:
# Display the assembled submission DataFrame.
submission_df

Unnamed: 0,Ids,class_0,class_1
0,00eed32682bb,0.492623,0.507377
1,010ebe33f668,0.492623,0.507377
2,02fa521e1838,0.492623,0.507377
3,040e15f562a2,0.492623,0.507377
4,046e85c7cc7f,0.492623,0.507377


In [2]:
# Write the submission frame to disk. The original referenced `df`, a name that
# is never defined in this notebook, so the cell failed on a fresh kernel.
# index=False keeps the DataFrame's row index from being written as an extra
# unnamed first column.
submission_df.to_csv('submission.csv', index=False)