In [86]:
import pandas as pd
import numpy as np
from scipy import stats
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, StandardScaler, OneHotEncoder

from sklearn.linear_model import LogisticRegression, Ridge
from sklearn.ensemble import RandomForestClassifier, GradientBoostingRegressor, VotingClassifier, StackingClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import RidgeClassifier
import xgboost as xgb
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [87]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\akash\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\akash\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\akash\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

Pre-Processing

In [89]:
df = pd.read_csv("C:/Users/akash/Documents/GitHub/threepc/essays.csv", encoding="cp1252")
df.columns

Index(['TEXT', 'extraversion', 'neuroticism', 'agreeableness',
       'conscientiousness', 'openness'],
      dtype='object')

In [90]:
def preprocess_text(text):
    tokens = word_tokenize(text.lower())  # Tokenization and lowercasing
    tokens = [word for word in tokens if word.isalnum()]  # Remove punctuation
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]  # Remove stopwords
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]  # Lemmatization
    return ' '.join(tokens)

In [91]:
X_preprocessed = [preprocess_text(essay) for essay in df['TEXT']]

In [92]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(X_preprocessed)
pd.set_option('future.no_silent_downcasting', True)
y = df[['openness', 'conscientiousness', 'extraversion', 'agreeableness', 'neuroticism']].replace({'y': 1, 'n': 0})
print(y.head())
y = y.fillna(0)


  openness conscientiousness extraversion agreeableness neuroticism
0        1                 0            0             1           1
1        0                 0            0             1           0
2        1                 1            0             0           1
3        0                 1            1             1           0
4        1                 0            1             1           0


In [93]:
ohe_cols = ['openness', 'conscientiousness', 'extraversion', 'agreeableness', 'neuroticism']

ohe_pipeline = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('ohe', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

col_trans = ColumnTransformer(transformers=[
    ('ohe_p', ohe_pipeline, ohe_cols),
    ],
    remainder='passthrough', 
    n_jobs=-1)

pipeline = Pipeline(steps=[
    ('preprocessing', col_trans)
])

After Pre-Processing

In [98]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [99]:
print(np.unique(y_train))
print(y_train.shape)
y_train = y_train.astype(int)

[0 1]
(1973, 5)


In [100]:
rf = RandomForestClassifier(n_estimators=100, random_state=42)
multi_target_rf = MultiOutputClassifier(rf, n_jobs=-1)
multi_target_rf.fit(X_train, y_train)
y_pred = multi_target_rf.predict(X_test)

In [101]:
y_pred = multi_target_rf.predict(X_test)

In [102]:
print(y_test.shape)
print(y_pred.shape)  
y_test = y_test.astype(int)
y_pred = y_pred.astype(int)
print("Labels in y_test:", np.unique(y_test))
print("Labels in y_pred:", np.unique(y_pred))

(494, 5)
(494, 5)
Labels in y_test: [0 1]
Labels in y_pred: [0 1]


In [103]:
print(classification_report(y_test, y_pred, zero_division=1))

              precision    recall  f1-score   support

           0       0.63      0.56      0.59       257
           1       0.57      0.55      0.56       267
           2       0.54      0.55      0.55       267
           3       0.57      0.66      0.61       274
           4       0.52      0.59      0.55       234

   micro avg       0.56      0.58      0.57      1299
   macro avg       0.57      0.58      0.57      1299
weighted avg       0.57      0.58      0.57      1299
 samples avg       0.58      0.60      0.53      1299



In [104]:
df.describe()

Unnamed: 0,TEXT,extraversion,neuroticism,agreeableness,conscientiousness,openness
count,2467,2467,2467,2467,2467,2467
unique,2467,2,2,2,2,2
top,"Well, right now I just woke up from a mid-day ...",y,n,y,y,y
freq,1,1276,1234,1310,1253,1271


In [105]:
from sklearn.multioutput import MultiOutputRegressor

from sklearn.linear_model import LogisticRegression

# Initialize your logistic regression model
lr = LogisticRegression(max_iter=1000)

# Wrap it in a MultiOutputRegressor
multi_output_lr = MultiOutputRegressor(lr)

# Fit the model on the training data
multi_output_lr.fit(X_train, y_train)

In [106]:
y_pred = multi_output_lr.predict(X_test)

print(classification_report(y_test, y_pred, zero_division=1))

              precision    recall  f1-score   support

           0       0.64      0.65      0.65       257
           1       0.61      0.57      0.59       267
           2       0.57      0.58      0.58       267
           3       0.59      0.69      0.64       274
           4       0.55      0.61      0.58       234

   micro avg       0.59      0.62      0.61      1299
   macro avg       0.59      0.62      0.61      1299
weighted avg       0.59      0.62      0.61      1299
 samples avg       0.62      0.64      0.56      1299



In [107]:
# Initialize an XGBoost classifier
xgb_model = xgb.XGBClassifier( eval_metric='logloss')

# Wrap it in MultiOutputClassifier to handle multiple labels
multi_xgb = MultiOutputClassifier(xgb_model)

# Train the model
multi_xgb.fit(X_train, y_train)

In [108]:
y_pred = multi_xgb.predict(X_test)

print(classification_report(y_test, y_pred, zero_division=1))

              precision    recall  f1-score   support

           0       0.61      0.58      0.60       257
           1       0.55      0.49      0.52       267
           2       0.58      0.57      0.57       267
           3       0.55      0.60      0.57       274
           4       0.52      0.57      0.54       234

   micro avg       0.56      0.56      0.56      1299
   macro avg       0.56      0.56      0.56      1299
weighted avg       0.56      0.56      0.56      1299
 samples avg       0.59      0.58      0.52      1299



params = {
    "estimator__learning_rate": [0.01, 0.1, 0.3],
    "estimator__n_estimators": [100, 300, 500],
    "estimator__max_depth": [3, 5, 7]
}

grid_search = GridSearchCV(multi_xgb, param_grid=params, cv=3, scoring='f1_macro', verbose=1)
grid_search.fit(X_train, y_train)

# Best parameters
print(grid_search.best_params_)

In [110]:
ridge_multi = MultiOutputClassifier(RidgeClassifier())
ridge_multi.fit(X_train, y_train)
y_pred = ridge_multi.predict(X_test)

In [111]:
print(classification_report(y_test, y_pred, zero_division=1))

              precision    recall  f1-score   support

           0       0.62      0.59      0.61       257
           1       0.58      0.55      0.57       267
           2       0.56      0.52      0.54       267
           3       0.59      0.62      0.60       274
           4       0.54      0.62      0.57       234

   micro avg       0.58      0.58      0.58      1299
   macro avg       0.58      0.58      0.58      1299
weighted avg       0.58      0.58      0.58      1299
 samples avg       0.60      0.60      0.54      1299



In [112]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import GridSearchCV
gbc = GradientBoostingClassifier()
multi_gbc = MultiOutputClassifier(gbc)
multi_gbc.fit(X_train, y_train)

In [113]:
y_pred = multi_gbc.predict(X_test)

In [114]:
print(classification_report(y_test, y_pred, zero_division=1))

              precision    recall  f1-score   support

           0       0.62      0.63      0.63       257
           1       0.59      0.55      0.57       267
           2       0.57      0.55      0.56       267
           3       0.58      0.64      0.61       274
           4       0.53      0.63      0.57       234

   micro avg       0.58      0.60      0.59      1299
   macro avg       0.58      0.60      0.59      1299
weighted avg       0.58      0.60      0.59      1299
 samples avg       0.59      0.63      0.55      1299



In [115]:
voting_clf = VotingClassifier(
    estimators=[('gbc', multi_gbc), ('ridge', ridge_multi), ('xgb', multi_xgb)],
    voting='soft'  # 'soft' uses predicted probabilities, 'hard' uses majority vote
)
y.shape
y.describe()
y.head()

Unnamed: 0,openness,conscientiousness,extraversion,agreeableness,neuroticism
0,1,0,0,1,1
1,0,0,0,1,0
2,1,1,0,0,1
3,0,1,1,1,0
4,1,0,1,1,0


In [116]:
voting_clf.fit(X_train, y_train)

# Predict on test data
y_pred = voting_clf.predict(X_test)

# Evaluate
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

NotImplementedError: VotingClassifier only supports binary or multiclass classification. Multilabel and multi-output classification are not supported.

In [None]:
stack_clf = StackingClassifier(
    estimators=[('multi_xgb', multi_xgb), ('multi_ridge', ridge_multi), ('multi_gbc', multi_gbc)],
    final_estimator=RidgeClassifier()
)
multi_stack_clf = MultiOutputClassifier(stack_clf)
# Train the model
multi_stack_clf.fit(X_train, y_train)

# Predictions
y_pred = stack_clf.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)