In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [2]:
from sklearn.metrics import classification_report, accuracy_score
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from sklearn.model_selection import GridSearchCV
from nltk.stem import WordNetLemmatizer
from xgboost import XGBClassifier
from xgboost import XGBClassifier


In [3]:
pip show imbalanced-learn

Name: imbalanced-learn
Version: 0.10.1
Summary: Toolbox for imbalanced dataset in machine learning.
Home-page: https://github.com/scikit-learn-contrib/imbalanced-learn
Author: 
Author-email: 
License: MIT
Location: /usr/local/lib/python3.10/dist-packages
Requires: joblib, numpy, scikit-learn, scipy, threadpoolctl
Required-by: 


In [4]:
pip show scikit-learn

Name: scikit-learn
Version: 1.2.2
Summary: A set of python modules for machine learning and data mining
Home-page: http://scikit-learn.org
Author: 
Author-email: 
License: new BSD
Location: /usr/local/lib/python3.10/dist-packages
Requires: joblib, numpy, scipy, threadpoolctl
Required-by: bigframes, fastai, imbalanced-learn, librosa, mlxtend, qudida, sklearn-pandas, yellowbrick


In [5]:
from imblearn.over_sampling import SMOTE

In [6]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [13]:
data = pd.read_csv('/content/drive/MyDrive/NLPFinalProject/Model Implementation/imbalancedtask1.csv')

In [14]:
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
stemmer = SnowballStemmer('english')

lemmatizer = WordNetLemmatizer()

def preprocess_text_lemmatize(text):
    tokens = word_tokenize(text.lower())
    tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]
    return ' '.join(tokens)

data['chat1_processed'] = data['chat1'].apply(preprocess_text_lemmatize)
data['chat2_processed'] = data['chat2'].apply(preprocess_text_lemmatize)

# Using a more complex model like XGBoost
from xgboost import XGBClassifier
vectorizer = TfidfVectorizer()
# Extract features from the preprocessed text data
X = vectorizer.fit_transform(data['chat1_processed'] + ' ' + data['chat2_processed'])

import pickle
with open('vectorizer.pkl', 'wb') as file:
    pickle.dump(vectorizer, file)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [10]:
# Using SMOTE for oversampling
from imblearn.over_sampling import SMOTE

# Perform oversampling before splitting into training and test sets
smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X, data['label'])

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.2, random_state=42)

# Use GridSearchCV for hyperparameter tuning
from sklearn.model_selection import GridSearchCV

# Define the parameter grid
param_grid = {
    'learning_rate': [0.01, 0.1],
    'max_depth': [3, 5],
    'min_child_weight': [1, 3, 5],
    'subsample': [0.5,],
    'colsample_bytree': [0.5],
    'n_estimators' : [100],
    'objective': ['binary:logistic']
}

# Initialize the classifier
xgb = XGBClassifier()

# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=xgb, param_grid=param_grid, cv=3, scoring='f1', verbose=2, n_jobs=-1)

# Fit GridSearchCV
grid_search.fit(X_train, y_train)

# Get the best parameters
best_params = grid_search.best_params_
print("Best parameters: ", best_params)

# Train a XGBClassifier model on the training set with the best parameters
xgb_best = XGBClassifier(**best_params)
xgb_best.fit(X_train, y_train)

# Make predictions on the test set
y_pred_xgb = xgb_best.predict(X_test)

# Evaluate the performance of the model
print('Accuracy:', accuracy_score(y_test, y_pred_xgb))
print('Precision:', precision_score(y_test, y_pred_xgb))
print('Recall:', recall_score(y_test, y_pred_xgb))
print('F1 score:', f1_score(y_test, y_pred_xgb))


Fitting 3 folds for each of 12 candidates, totalling 36 fits
Best parameters:  {'colsample_bytree': 0.5, 'learning_rate': 0.1, 'max_depth': 5, 'min_child_weight': 1, 'n_estimators': 100, 'objective': 'binary:logistic', 'subsample': 0.5}
Accuracy: 0.896969696969697
Precision: 0.9283489096573209
Recall: 0.8688046647230321
F1 score: 0.8975903614457832


In [11]:
y_pred_xgb

array([1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0,
       0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1,
       0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1,
       1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1,
       0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0,
       1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0,
       1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1,
       1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0,
       0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0,
       0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1,
       0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1,
       0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0,

In [12]:
X_test

<660x42609 sparse matrix of type '<class 'numpy.float64'>'
	with 238924 stored elements in Compressed Sparse Row format>

In [15]:
print("\nClassification Report:\n", classification_report(y_test, y_pred_xgb))


Classification Report:
               precision    recall  f1-score   support

       False       0.87      0.93      0.90       317
        True       0.93      0.87      0.90       343

    accuracy                           0.90       660
   macro avg       0.90      0.90      0.90       660
weighted avg       0.90      0.90      0.90       660



In [16]:
import pickle

filename = 'XBBOOST_Imbalanced.pkl'
pickle.dump(xgb_best, open(filename, 'wb'))

In [17]:
# Example text input
text_input = "china and japan's budding relationship in the time of coronavirus. the virus has been linked to a series deaths from pneumonia since last year but no cases have yet shown up outside asia or even within china. it is thought that it could be spread by people who come into contact with infected animals such as monkeys at zoos around asia. but experts say they are still trying determine how far this disease can spread. they said there was little evidence so far, however, suggesting any link between humans being bitten while on holiday abroad and, say, an outbreak here. and many tourists visiting hong kong were not infected. a chinese tourist died after returning home earlier than expected. in japan, where two japanese men fell ill shortly before christmas, authorities ordered all visitors staying over for christmas holidays away until further notice. some hotels also closed their doors early because guests had already left. there appeared today only one case among those infected, which occurred when someone returned late saturday night -- about eight hours later - bringing back his own travel bag. he did not, though, show symptoms himself. authorities believe he contracted sars during overseas travel. at least three other travelers came down sick there. one man died, another recovered fully, according tokyo hospitals. all five patients worked overseas, including four americans working abroad. another american woman got sick. officials don't know if she caught it. japan will test more hotel rooms next week ahead of, possibly, additional cases. most foreign passengers arriving here stay overnight anyway, officials said. we do everything we can, yoshiaki okamoto, head doctor general told reporters. we're doing our best. health minister yukio edano onaga visited shanghai today. his visit comes amid concerns china's new measures against possible outbreak. government ban announced friday morning flight cancellations due partly blamed flu epidemic scare caused flights canceled yesterday. mr. offered yesterday afternoon beijing airport cancellation yesterday, canceling some airlines cancelled hundreds left thousands stranded overseas. on thursday, air traffic delays delayed arrival tuesday, local media reports suggest few days ago cancel flights, leaving millions without explanation. two u.s. secretary general motors chief executive says airline executives made public health official announcement monday morning. with most likely would like safety issues affecting united states department spokesman called tuesday may 26 march 8 a.m. flight 370 million tickets canceled. even though boeing 787 scheduled departure april 4 p.m. this week, delta air france airlines jet planes grounded its main carrier flies out 1 june 6, july 21 september 11, plane crash involving korean airlines. what you can't fly feb. 19 august 15, 2009 world trade center san francisco airport new york city international airport's arrival. passengers wait list february 6 billion dollar trade center boston area 2 day 15 years ago. if anyone sent washington 11 december 12, 2003 america must now available january 2005 10 miles per capita economic crisis 5 november 29 october 10, sept. 11. where does not. as"
# Preprocess the text input using the same preprocessing function
text_input_processed = preprocess_text_lemmatize(text_input)

# Vectorize the preprocessed text input
text_input_vectorized = vectorizer.transform([text_input_processed])

# Use the trained XGBoost model to make a prediction
prediction = xgb_best.predict(text_input_vectorized)

# Print the prediction
print("Prediction for the text input:", prediction[0])

Prediction for the text input: 0


In [18]:
prediction

array([0])

In [19]:
data['chat1'][5]

"china and japan's budding relationship in the time of coronavirus. the virus has been linked to a series deaths from pneumonia since last year but no cases have yet shown up outside asia or even within china. it is thought that it could be spread by people who come into contact with infected animals such as monkeys at zoos around asia. but experts say they are still trying determine how far this disease can spread. they said there was little evidence so far, however, suggesting any link between humans being bitten while on holiday abroad and, say, an outbreak here. and many tourists visiting hong kong were not infected. a chinese tourist died after returning home earlier than expected. in japan, where two japanese men fell ill shortly before christmas, authorities ordered all visitors staying over for christmas holidays away until further notice. some hotels also closed their doors early because guests had already left. there appeared today only one case among those infected, which oc