In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score
from xgboost import XGBClassifier

In [2]:
# Read the train data
train_data = pd.read_csv('Train_Data.csv')

In [3]:
# basic EDA
train_data.head()

Unnamed: 0,pc,ld,m0,m1,m2,m3,m4,m5,m6,m7,m8,m9,m10,m11,m12,m13,m14,ma,pred
0,y,83.71,63.0,7.2,52.5,14.0232,130.83,12.428,188.8425,8.952,201.1905,9.2896,141.9075,16.0968,150.339,12.488,173.124,m78,0
1,y,108.94,31.5,12.8,84.0,13.284,128.835,13.5256,183.099,8.852,207.2385,8.4704,154.7805,13.3304,101.0205,12.5096,131.4075,m78,0
2,E,169.65,0.0,5.6,73.5,14.5472,128.9295,13.6424,174.468,8.98,190.3125,11.3056,156.765,,122.535,11.7136,176.82,m76,1
3,x,122.42,31.5,7.2,63.0,15.0152,119.8575,12.3344,186.858,10.7208,193.8195,10.6096,175.749,,124.803,13.8424,168.2625,m55,0
4,E,125.43,94.5,7.2,42.0,14.4176,135.429,14.5824,187.8135,9.3088,203.154,9.828,172.704,14.472,120.2145,,150.1185,m76,0


In [4]:
# basic EDA
train_data.shape

(22584, 19)

In [5]:
# basic EDA
train_data.columns

Index(['pc', 'ld', 'm0', 'm1', 'm2', 'm3', 'm4', 'm5', 'm6', 'm7', 'm8', 'm9',
       'm10', 'm11', 'm12', 'm13', 'm14', 'ma', 'pred'],
      dtype='object')

In [6]:
# basic EDA
train_data.dtypes

pc       object
ld      float64
m0      float64
m1      float64
m2      float64
m3      float64
m4      float64
m5      float64
m6      float64
m7      float64
m8      float64
m9      float64
m10     float64
m11     float64
m12     float64
m13     float64
m14     float64
ma       object
pred      int64
dtype: object

In [7]:
# Split the train data into features (X) and target variable (y)
X_train = train_data.drop('pred', axis=1)
y_train = train_data['pred']

In [9]:
# Encoding categorical variables
encoder = LabelEncoder()
X_train['pc'] = encoder.fit_transform(X_train['pc'])
X_train['ma'] = encoder.fit_transform(X_train['ma'].astype(str))

In [10]:
# Handling missing values
X_train = X_train.fillna(0)  # Replace NaN values with 0 or choose an appropriate strategy

In [11]:
# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)


In [12]:
# Train the model
model = XGBClassifier()
model.fit(X_train, y_train)

In [13]:
# Predict on the validation set
y_val_pred = model.predict(X_val)

In [14]:
# Evaluate the model
f1 = f1_score(y_val, y_val_pred)
print("F1 score:", f1)

F1 score: 0.06010928961748634


In [28]:
# Read the test data
test_data = pd.read_csv('Test_Data.csv')
test_data = test_data[X_train.columns]

In [29]:
# Preprocess the test data
test_data['pc'] = test_data['pc'].map(lambda s: encoder.transform([s])[0] if s in encoder.classes_ else -1)
test_data['ma'] = test_data['ma'].map(lambda s: encoder.transform([s])[0] if s in encoder.classes_ else -1)
test_data = test_data.fillna(0)  # Replace NaN values with 0 or choose an appropriate strategy


In [30]:
# Predict on the test data
y_test_pred = model.predict(test_data)

In [31]:
# Prepare submission
submission = pd.DataFrame({'pred': y_test_pred})

# Save the submission to a CSV file
submission.to_csv('submission.csv', index=False)