In [23]:
import nltk
import numpy as np
import pandas as pd
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

# Download the NLTK packages this notebook requests, in the same order as
# before: the POS-tagger model, the WordNet corpus and the punkt tokenizer data.
for nltk_package in ('averaged_perceptron_tagger', 'wordnet', 'punkt'):
    nltk.download(nltk_package)

def get_wordnet_pos(word):
    """Return the WordNet part-of-speech constant to use when lemmatizing ``word``.

    The word is tagged on its own via ``nltk.pos_tag([word])`` and the
    upper-cased first character of the resulting tag selects the constant:
    ``J`` -> ``wordnet.ADJ``, ``N`` -> ``wordnet.NOUN``, ``V`` -> ``wordnet.VERB``,
    ``R`` -> ``wordnet.ADV``. Any other initial falls back to ``wordnet.NOUN``.

    If anything inside the lookup raises, the exception is printed and
    ``wordnet.NOUN`` is returned, so mapping this over many tokens keeps going.
    """
    try:
        tagged = nltk.pos_tag([word])
        tag_text = tagged[0][1]
        initial = tag_text[0].upper()
        pos_by_initial = {
            "J": wordnet.ADJ,
            "N": wordnet.NOUN,
            "V": wordnet.VERB,
            "R": wordnet.ADV,
        }
        if initial in pos_by_initial:
            return pos_by_initial[initial]
        return wordnet.NOUN
    except Exception as err:
        # Best-effort fallback: report the problem and treat the word as a noun.
        print(f"Exception occurred: {err}")
        return wordnet.NOUN

def preprocess_data(data):
    """Lower-case and lemmatize the ``text`` column of ``data`` in place.

    Each entry is lower-cased, split with ``nltk.word_tokenize``, every token is
    lemmatized with the POS returned by ``get_wordnet_pos``, and the lemmas are
    re-joined with single spaces.

    Relies on the module-level ``lemmatizer`` existing by the time this function
    is called (it is looked up at call time, not at definition time).

    Returns the same object that was passed in; its ``text`` column has been
    overwritten.
    """
    def _lemmatize_text(raw_text):
        # Lemmatize token by token, each with its own POS lookup.
        lemmas = []
        for token in nltk.word_tokenize(raw_text):
            lemmas.append(lemmatizer.lemmatize(token, get_wordnet_pos(token)))
        return ' '.join(lemmas)

    column = 'text'
    data[column] = data[column].str.lower()
    data[column] = data[column].apply(_lemmatize_text)
    return data

# Configuration for this cell. The values are unchanged; they are only named so
# they can be tuned in one place.
# NOTE(review): DATA_PATH is an absolute, machine-specific path. It is kept
# verbatim so existing behaviour does not change; point it at a local copy of
# train.csv when running elsewhere.
DATA_PATH = 'C:/Users/Josep/Downloads/Compressed/train.csv'
TEST_SIZE = 0.2
SPLIT_SEED = 42
MAX_FEATURES = 5000

# Load data
data = pd.read_csv(DATA_PATH)

# Initialize lemmatizer (preprocess_data reads this module-level name at call time)
lemmatizer = WordNetLemmatizer()

# Preprocess data: lower-cases and lemmatizes data['text'] in place
data = preprocess_data(data)

# Separate features and target
X = data['text']
y = data['target']

# Split the data into train and test sets (fixed seed so the split is repeatable)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=SPLIT_SEED
)

# Vectorize the text data. The vectorizer is fitted on the training split only
# and then applied unchanged to the test split.
# Note: X_train / X_test are rebound here from raw text to TF-IDF feature
# matrices; everything below this point sees the vectorized versions.
vectorizer = TfidfVectorizer(max_features=MAX_FEATURES)
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

# Define and train the model.
# LogisticRegression lives in sklearn.linear_model and must be imported in the
# notebook's import block; without that import this line raises NameError on a
# fresh kernel.
lr_model = LogisticRegression(penalty='l2', C=1, solver='liblinear', max_iter=100)
lr_model.fit(X_train, y_train)

# Predict on the test data
y_pred = lr_model.predict(X_test)

# Print the classification report
print(classification_report(y_test, y_pred))

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Josep\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Josep\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Josep\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


              precision    recall  f1-score   support

           0       0.80      0.87      0.84       874
           1       0.81      0.71      0.75       649

    accuracy                           0.80      1523
   macro avg       0.80      0.79      0.79      1523
weighted avg       0.80      0.80      0.80      1523



In [22]:
# Try a different strategy: a random forest on the same TF-IDF features.
# X_train / X_test are the TF-IDF matrices produced by the vectorizer step above.
# The names X_train_tfidf / X_test_tfidf that this cell used before are not
# defined in any cell visible in this notebook, so a Restart & Run All failed
# here with NameError.
# RandomForestClassifier is already imported in the notebook's import block, so
# it is not re-imported in this cell.
rf_model = RandomForestClassifier(
    n_estimators=300, max_depth=None, min_samples_split=2, random_state=0
)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)
print(classification_report(y_test, y_pred_rf))

# Save the same report as a table: output_dict=True returns the metrics as a
# dict, which is transposed into a DataFrame and written to CSV with its index.
classification_rep_rf = classification_report(y_test, y_pred_rf, output_dict=True)
classification_df_rf = pd.DataFrame(classification_rep_rf).transpose()
classification_df_rf.to_csv('classification_report_rf.csv', index=True)

              precision    recall  f1-score   support

           0       0.77      0.92      0.84       874
           1       0.86      0.64      0.73       649

    accuracy                           0.80      1523
   macro avg       0.81      0.78      0.78      1523
weighted avg       0.81      0.80      0.79      1523

