In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Load the data
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")
submission_df = pd.read_csv("sample_submission.csv")

# Inspect the data
print(train_df.head())
print(test_df.head())

# Data preprocessing
# Handle missing values if any
train_df.fillna(-999, inplace=True)
test_df.fillna(-999, inplace=True)

# Encode categorical variables
le = LabelEncoder()
for col in train_df.select_dtypes(include='object').columns:
    if col != 'Target':  # Skip the target column in training data
        train_df[col] = le.fit_transform(train_df[col])
        test_df[col] = le.transform(test_df[col])

# Split the data into features and target variable
X = train_df.drop('Target', axis=1)
y = train_df['Target']

# Encode the target variable
y = le.fit_transform(y)

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Feature scaling
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
test_df_scaled = scaler.transform(test_df)

# Train a model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Evaluate the model
y_pred = model.predict(X_val)
accuracy = accuracy_score(y_val, y_pred)
print(f'Validation Accuracy: {accuracy * 100:.2f}%')

# Make predictions on the test set
test_predictions = model.predict(test_df_scaled)

# Decode the predictions
test_predictions = le.inverse_transform(test_predictions)

# Prepare the submission
submission_df['Target'] = test_predictions
submission_df.to_csv("sample_submission.csv", index=False)

print('Submission file created successfully.')


   id  Marital status  Application mode  ...  Inflation rate   GDP    Target
0   0               1                 1  ...             0.6  2.02  Graduate
1   1               1                17  ...             0.6  2.02   Dropout
2   2               1                17  ...             0.3 -0.92   Dropout
3   3               1                 1  ...             0.6  2.02  Enrolled
4   4               1                 1  ...             2.6  0.32  Graduate

[5 rows x 38 columns]
      id  Marital status  ...  Inflation rate   GDP
0  76518               1  ...            -0.3  0.79
1  76519               1  ...             0.6  2.02
2  76520               1  ...             2.8 -4.06
3  76521               1  ...             1.4  3.51
4  76522               1  ...             2.6  0.32

[5 rows x 37 columns]
Validation Accuracy: 82.78%
Submission file created successfully.
