# Importing necessary libraries

In [95]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder


## Loading datasets

In [86]:
# Read the training and test CSV files; the relative paths resolve against
# the current working directory.
df_train, df_test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))


## Finding datatypes for better preprocessing

In [97]:
# Show the dtype pandas inferred for each training column.
df_train.dtypes

Date_Time                  object
Sex                        object
Nation                     object
Job                        object
Freelance_Status           object
Genetic_Background         object
Therapy                    object
Indoor_Days                object
Increasing_Stress          object
Habit_Changes              object
Psychological_History      object
Emotional_Fluctuations     object
Adaptation_Challenges      object
Job_Engagement             object
Social_Vulnerability       object
Psych_Evaluation           object
Support_Options            object
Body_Weight               float64
dtype: object

## Counting null values in each column

In [99]:
# Per-column count of missing entries in the training frame.
df_train.isna().sum()

Date_Time                    0
Sex                          0
Nation                       1
Job                          0
Freelance_Status          4965
Genetic_Background           0
Therapy                      1
Indoor_Days                  3
Increasing_Stress            0
Habit_Changes                0
Psychological_History        2
Emotional_Fluctuations       0
Adaptation_Challenges        1
Job_Engagement               4
Social_Vulnerability         0
Psych_Evaluation             2
Support_Options              4
Body_Weight                  1
dtype: int64

In [98]:
# Show the dtype pandas inferred for each test column.
df_test.dtypes

ID                        int64
Date_Time                object
Sex                      object
Nation                   object
Job                      object
Freelance_Status         object
Genetic_Background       object
Therapy                  object
Indoor_Days              object
Increasing_Stress        object
Habit_Changes            object
Psychological_History    object
Adaptation_Challenges    object
Job_Engagement           object
Social_Vulnerability     object
Psych_Evaluation         object
Support_Options          object
Body_Weight               int64
dtype: object

## Separating input and target columns

In [100]:
# The target is the Emotional_Fluctuations column; the features are every
# other column except Date_Time.
y = df_train.loc[:, 'Emotional_Fluctuations']
X = df_train.drop(['Emotional_Fluctuations', 'Date_Time'], axis=1)

## Preparing test data for prediction

In [101]:
# Drop the ID and Date_Time columns to obtain the model inputs, and keep the
# IDs separately so they can be written next to the predictions later.
X_test = df_test.drop(['ID', 'Date_Time'], axis=1)
test_ids = df_test.loc[:, 'ID']

## Identifying categorical and numerical features


In [102]:
# Route every feature column to exactly one transformer.
# 'number' matches numeric columns of any width (int32, float32, ...), not
# just int64/float64, and every non-numeric column is treated as categorical.
# A column that appears in neither list would be discarded by
# ColumnTransformer's default remainder='drop', so the two lists are built as
# complements of each other.
numerical_features = X.select_dtypes(include='number').columns.tolist()
categorical_features = X.select_dtypes(exclude='number').columns.tolist()

## Creating a numerical feature transformer


In [103]:
# Numeric columns: replace missing values with the column mean, then
# standardise with StandardScaler.
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

## Creating a categorical feature transformer


In [104]:
# Categorical columns: replace missing values with the most frequent value,
# then one-hot encode. handle_unknown='ignore' keeps categories that were not
# seen during fit from raising at transform time.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

## Combining transformers with ColumnTransformer


In [105]:
# Apply each sub-pipeline to its own group of columns.
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features),
])

## Building the machine learning pipeline


In [106]:
# Random forest classifier with 100 trees and a fixed random_state.
model = RandomForestClassifier(random_state=0, n_estimators=100)

# Chain preprocessing and the classifier into a single estimator.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', model),
])

## Splitting the data and fitting the random forest classifier pipeline


In [107]:
# Hold out 20% of the rows for validation, using a fixed random_state.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, random_state=0, test_size=0.2
)

# Train the preprocessing + model pipeline on the training portion only.
pipeline.fit(X_train, y_train)

## Evaluating

In [108]:
# Predict labels for the held-out validation rows.
y_pred = pipeline.predict(X_valid)

# Report the accuracy of the predictions against the true validation labels.
# accuracy_score is imported in the import cell at the top of the notebook.
print('Validation Accuracy:', accuracy_score(y_valid, y_pred))

Validation Accuracy: 0.9762916291629163


# Output file

In [109]:
# Refit the pipeline on all of X and y, then predict labels for the test rows.
test_predictions = pipeline.fit(X, y).predict(X_test)

# Pair each test ID with its predicted label and write the result to
# submission.csv without the DataFrame index.
submission = pd.DataFrame({'ID': test_ids, 'Emotional_Fluctuations': test_predictions})
submission.to_csv('submission.csv', index=False)