  
  IMPORTING LIBRARIES

In [1]:
import numpy as np
import pandas as pd
import re #regular expression
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
from nltk.corpus import stopwords

PREPROCESSING

In [3]:
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [4]:
twitter_data = pd.read_csv('data/training.1600000.processed.noemoticon.csv', encoding = 'ISO-8859-1')

In [5]:
twitter_data.shape

(1599999, 6)

In [6]:
twitter_data.head()

Unnamed: 0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D"
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew


In [7]:
column_names = ['target','id','date','flag','user','text']
twitter_data = pd.read_csv('data/training.1600000.processed.noemoticon.csv',names = column_names, encoding = 'ISO-8859-1')

In [8]:
twitter_data.shape

(1600000, 6)

In [9]:
twitter_data.head()

Unnamed: 0,target,id,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [10]:
twitter_data.isnull().sum()

target    0
id        0
date      0
flag      0
user      0
text      0
dtype: int64

In [11]:
twitter_data['target'].value_counts()

target
0    800000
4    800000
Name: count, dtype: int64

In [12]:
twitter_data.replace({'target':{4:1}}, inplace=True)

In [13]:
twitter_data['target'].value_counts()

target
0    800000
1    800000
Name: count, dtype: int64

STEMMING

In [14]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from joblib import Parallel, delayed, parallel_backend
import multiprocessing

port_stem = PorterStemmer()

def stemming(content, stop_words):
    stemmed_content = re.sub('[^a-zA-Z]',' ',content)
    stemmed_content = stemmed_content.lower()
    stemmed_content = stemmed_content.split()
    stemmed_content = [port_stem.stem(word) for word in stemmed_content if not word in stop_words]
    stemmed_content = ' '.join(stemmed_content)

    return stemmed_content

def apply_stemming_parallel(data, stop_words, n_jobs=-1):
    with parallel_backend('loky', n_jobs=n_jobs):
        return Parallel()(delayed(stemming)(content, stop_words) for content in data)

num_cores = multiprocessing.cpu_count()
chunk_size = 1000
chunks = [twitter_data['text'][i:i+chunk_size] for i in range(0, len(twitter_data['text']), chunk_size)]

stop_words = set(stopwords.words('english'))

stemmed_contents = Parallel(n_jobs=num_cores)(delayed(apply_stemming_parallel)(chunk, stop_words, n_jobs=-1) for chunk in chunks)

stemmed_contents = [item for sublist in stemmed_contents for item in sublist]

twitter_data['stemmed_content'] = stemmed_contents

In [15]:
twitter_data.head()

Unnamed: 0,target,id,date,flag,user,text,stemmed_content
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",switchfoot http twitpic com zl awww bummer sho...
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...,upset updat facebook text might cri result sch...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...,kenichan dive mani time ball manag save rest g...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire,whole bodi feel itchi like fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all....",nationwideclass behav mad see


In [16]:
print(twitter_data['stemmed_content'])

0          switchfoot http twitpic com zl awww bummer sho...
1          upset updat facebook text might cri result sch...
2          kenichan dive mani time ball manag save rest g...
3                            whole bodi feel itchi like fire
4                              nationwideclass behav mad see
                                 ...                        
1599995                           woke school best feel ever
1599996    thewdb com cool hear old walt interview http b...
1599997                         readi mojo makeov ask detail
1599998    happi th birthday boo alll time tupac amaru sh...
1599999    happi charitytuesday thenspcc sparkschar speak...
Name: stemmed_content, Length: 1600000, dtype: object


In [17]:
print(twitter_data['target'])

0          0
1          0
2          0
3          0
4          0
          ..
1599995    1
1599996    1
1599997    1
1599998    1
1599999    1
Name: target, Length: 1600000, dtype: int64


TFIDF VECTORIZATION

In [22]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import time
from joblib import parallel_backend

X = twitter_data['stemmed_content'].values
y = twitter_data['target'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=2)

vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=2**18)
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

SVM

In [23]:
svm_classifier = LinearSVC(C=0.1, dual=False, max_iter=1000)

# Training
start_time = time.time()
with parallel_backend('threading'):
    svm_classifier.fit(X_train_vectorized, y_train)
svm_train_time = time.time() - start_time

# Cross-validation
cv_start_time = time.time()
cv_scores = cross_val_score(svm_classifier, X_train_vectorized, y_train, cv=5, scoring='accuracy', n_jobs=-1)
cv_time = time.time() - cv_start_time

# Predictions on test set
start_time = time.time()
svm_test_predictions = svm_classifier.predict(X_test_vectorized)
test_time = time.time() - start_time

# Evaluation on test set
svm_accuracy_test = accuracy_score(y_test, svm_test_predictions)
svm_precision_test = precision_score(y_test, svm_test_predictions)
svm_recall_test = recall_score(y_test, svm_test_predictions)
svm_f1_test = f1_score(y_test, svm_test_predictions)
svm_auc_roc_test = roc_auc_score(y_test, svm_test_predictions)

print("Cross-Validation Scores:", cv_scores)
print("Mean CV Accuracy:", cv_scores.mean())

print("\nTest Metrics:")
print("Training Time:", svm_train_time)
print("Test Time:", test_time)
print("Accuracy:", svm_accuracy_test)
print("Precision:", svm_precision_test)
print("Recall:", svm_recall_test)
print("F1 Score:", svm_f1_test)
print("Area Under ROC Curve:", svm_auc_roc_test)


Cross-Validation Scores: [0.78871484 0.78938281 0.79153516 0.78776172 0.79020313]
Mean CV Accuracy: 0.7895195312500001

Test Metrics:
Training Time: 22.712084531784058
Test Time: 0.10026741027832031
Accuracy: 0.791153125
Precision: 0.7797380636405671
Recall: 0.81155625
F1 Score: 0.7953290519371939
Area Under ROC Curve: 0.7911531250000001


LOGISTIC REGRESSION


In [24]:
# Define and configure the logistic regression classifier
logistic_classifier = LogisticRegression(max_iter=1000)

# Training
start_time = time.time()
with parallel_backend('threading'):
    logistic_classifier.fit(X_train_vectorized, y_train)
logistic_train_time = time.time() - start_time

# Cross-validation
cv_start_time = time.time()
cv_scores = cross_val_score(logistic_classifier, X_train_vectorized, y_train, cv=5, scoring='accuracy', n_jobs=-1)
cv_time = time.time() - cv_start_time

# Predictions on test set
start_time = time.time()
logistic_test_predictions = logistic_classifier.predict(X_test_vectorized)
test_time = time.time() - start_time

# Evaluation on test set
logistic_accuracy_test = accuracy_score(y_test, logistic_test_predictions)
logistic_precision_test = precision_score(y_test, logistic_test_predictions)
logistic_recall_test = recall_score(y_test, logistic_test_predictions)
logistic_f1_test = f1_score(y_test, logistic_test_predictions)
logistic_auc_roc_test = roc_auc_score(y_test, logistic_test_predictions)

print("Cross-Validation Scores:", cv_scores)
print("Mean CV Accuracy:", cv_scores.mean())

print("\nTest Metrics:")
print("Training Time:", logistic_train_time)
print("Test Time:", test_time)
print("Accuracy:", logistic_accuracy_test)
print("Precision:", logistic_precision_test)
print("Recall:", logistic_recall_test)
print("F1 Score:", logistic_f1_test)
print("Area Under ROC Curve:", logistic_auc_roc_test)

Cross-Validation Scores: [0.78878906 0.78968359 0.79185938 0.78778516 0.79057813]
Mean CV Accuracy: 0.7897390625

Test Metrics:
Training Time: 116.19415378570557
Test Time: 0.07979011535644531
Accuracy: 0.79156875
Precision: 0.7819132221416485
Recall: 0.80869375
F1 Score: 0.7950780385891606
Area Under ROC Curve: 0.7915687499999999


NAIVE BAYES

In [25]:
from sklearn.naive_bayes import MultinomialNB
# Define and configure the Naive Bayes classifier
nb_classifier = MultinomialNB()

# Training
start_time = time.time()
with parallel_backend('threading'):
    nb_classifier.fit(X_train_vectorized, y_train)
nb_train_time = time.time() - start_time

# Cross-validation
cv_start_time = time.time()
cv_scores = cross_val_score(nb_classifier, X_train_vectorized, y_train, cv=5, scoring='accuracy', n_jobs=-1)
cv_time = time.time() - cv_start_time


# Predictions on test set
start_time = time.time()
nb_test_predictions = nb_classifier.predict(X_test_vectorized)
test_time = time.time() - start_time

# Evaluation on test set
nb_accuracy_test = accuracy_score(y_test, nb_test_predictions)
nb_precision_test = precision_score(y_test, nb_test_predictions)
nb_recall_test = recall_score(y_test, nb_test_predictions)
nb_f1_test = f1_score(y_test, nb_test_predictions)
nb_auc_roc_test = roc_auc_score(y_test, nb_test_predictions)

print("Cross-Validation Scores:", cv_scores)
print("Mean CV Accuracy:", cv_scores.mean())

print("\nTest Metrics:")
print("Training Time:", nb_train_time)
print("Test Time:", test_time)
print("Accuracy:", nb_accuracy_test)
print("Precision:", nb_precision_test)
print("Recall:", nb_recall_test)
print("F1 Score:", nb_f1_test)
print("Area Under ROC Curve:", nb_auc_roc_test)

Cross-Validation Scores: [0.77157031 0.77291406 0.7763125  0.77202734 0.77311328]
Mean CV Accuracy: 0.7731875

Test Metrics:
Training Time: 0.41886210441589355
Test Time: 0.08178091049194336
Accuracy: 0.774628125
Precision: 0.7748222181915978
Recall: 0.774275
F1 Score: 0.7745485124434566
Area Under ROC Curve: 0.774628125


ENSEMBLE LEARNING

In [26]:

from sklearn.ensemble import VotingClassifier

# Define the ensemble classifier
ensemble_classifier = VotingClassifier(estimators=[
    ('svm', svm_classifier),
    ('logistic', logistic_classifier),
    ('nb', nb_classifier)
], voting='hard')

# Training
start_time = time.time()
with parallel_backend('threading', n_jobs=4):
    ensemble_classifier.fit(X_train_vectorized, y_train)
ensemble_train_time = time.time() - start_time

# Cross-validation
cv_start_time = time.time()
cv_scores = cross_val_score(ensemble_classifier, X_train_vectorized, y_train, cv=5, scoring='accuracy', n_jobs=4)
cv_time = time.time() - cv_start_time


# Predictions on test set
start_time = time.time()
ensemble_test_predictions = ensemble_classifier.predict(X_test_vectorized)
test_time = time.time() - start_time

# Evaluation on test set
ensemble_accuracy_test = accuracy_score(y_test, ensemble_test_predictions)
ensemble_precision_test = precision_score(y_test, ensemble_test_predictions)
ensemble_recall_test = recall_score(y_test, ensemble_test_predictions)
ensemble_f1_test = f1_score(y_test, ensemble_test_predictions)
ensemble_auc_roc_test = roc_auc_score(y_test, ensemble_test_predictions)

print("Cross-Validation Scores:", cv_scores)
print("Mean CV Accuracy:", cv_scores.mean())

print("\nTest Metrics:")
print("Training Time:", ensemble_train_time)
print("Test Time:", test_time)
print("Accuracy:", ensemble_accuracy_test)
print("Precision:", ensemble_precision_test)
print("Recall:", ensemble_recall_test)
print("F1 Score:", ensemble_f1_test)
print("Area Under ROC Curve:", ensemble_auc_roc_test)


Cross-Validation Scores: [0.78876953 0.78977734 0.79167969 0.78774219 0.79032813]
Mean CV Accuracy: 0.789659375

Test Metrics:
Training Time: 126.65012764930725
Test Time: 1.8575878143310547
Accuracy: 0.791178125
Precision: 0.7807026528730923
Recall: 0.8098375
F1 Score: 0.7950032364842271
Area Under ROC Curve: 0.791178125


RANDOM FOREST

In [27]:
from sklearn.ensemble import RandomForestClassifier

# Define and configure the Random Forest classifier
rf_classifier = RandomForestClassifier(n_estimators=400, max_depth=20, n_jobs=-1, random_state=42)

# Training
start_time = time.time()
with parallel_backend('threading'):
    rf_classifier.fit(X_train_vectorized, y_train)
rf_train_time = time.time() - start_time

# Cross-validation
cv_start_time = time.time()
cv_scores = cross_val_score(rf_classifier, X_train_vectorized, y_train, cv=5, scoring='accuracy', n_jobs=-1)
cv_time = time.time() - cv_start_time

# Predictions on test set
start_time = time.time()
rf_test_predictions = rf_classifier.predict(X_test_vectorized)
test_time = time.time() - start_time

# Evaluation on test set
rf_accuracy_test = accuracy_score(y_test, rf_test_predictions)
rf_precision_test = precision_score(y_test, rf_test_predictions)
rf_recall_test = recall_score(y_test, rf_test_predictions)
rf_f1_test = f1_score(y_test, rf_test_predictions)
rf_auc_roc_test = roc_auc_score(y_test, rf_test_predictions)

print("Cross-Validation Scores:", cv_scores)
print("Mean CV Accuracy:", cv_scores.mean())

print("\nTest Metrics:")
print("Training Time:", rf_train_time)
print("Test Time:", test_time)
print("Accuracy:", rf_accuracy_test)
print("Precision:", rf_precision_test)
print("Recall:", rf_recall_test)
print("F1 Score:", rf_f1_test)
print("Area Under ROC Curve:", rf_auc_roc_test)

Cross-Validation Scores: [0.74352734 0.74450781 0.74285156 0.74138672 0.74166406]
Mean CV Accuracy: 0.7427874999999999

Test Metrics:
Training Time: 214.9751410484314
Test Time: 18.821688175201416
Accuracy: 0.743390625
Precision: 0.7293446957873722
Recall: 0.7740125
F1 Score: 0.7510150121740817
Area Under ROC Curve: 0.743390625


In [28]:
import joblib

In [29]:
#Evaluate models on the test set
models = {
    'SVM' : svm_classifier,
    'Logistic Regression' : logistic_classifier,
    'Naive Bayes' : nb_classifier,
    'Random Forest' : rf_classifier,
    'Ensemble' : ensemble_classifier
}

In [30]:
for name, model in models.items():
    start_time = time.time()
    model.fit(X_train_vectorized, y_train)
    training_time = time.time() - start_time
    
    predictions = model.predict(X_test_vectorized)
    accuracy = accuracy_score(y_test, predictions)
    precision = precision_score(y_test, predictions, average='weighted')
    recall = recall_score(y_test, predictions, average='weighted')
    f1 = f1_score(y_test, predictions, average='weighted')
    auc = roc_auc_score(y_test, predictions, average='weighted', multi_class='ovr')
    
    print(f"{name}:")
    print(f"  Training Time: {training_time} seconds")
    print(f"  Accuracy: {accuracy*100}%")
    print(f"  Precision: {precision}")
    print(f"  Recall: {recall}")
    print(f"  F1 Score: {f1}")
    print(f"  AUC: {auc}")
    print()


SVM:
  Training Time: 25.471007823944092 seconds
  Accuracy: 79.1153125%
  Precision: 0.7916387472714111
  Recall: 0.791153125
  F1 Score: 0.7910661484472313
  AUC: 0.7911531250000001

Logistic Regression:
  Training Time: 178.20142936706543 seconds
  Accuracy: 79.156875%
  Precision: 0.7919111800586576
  Recall: 0.79156875
  F1 Score: 0.7915076063478679
  AUC: 0.7915687499999999

Naive Bayes:
  Training Time: 0.47132253646850586 seconds
  Accuracy: 77.4628125%
  Precision: 0.7746282619815733
  Recall: 0.774628125
  F1 Score: 0.7746280968967398
  AUC: 0.774628125

Random Forest:
  Training Time: 237.92444109916687 seconds
  Accuracy: 74.3390625%
  Precision: 0.7443069708322817
  Recall: 0.743390625
  F1 Score: 0.7431497767438087
  AUC: 0.743390625

Ensemble:
  Training Time: 158.98287153244019 seconds
  Accuracy: 79.1178125%
  Precision: 0.7915842111530605
  Recall: 0.791178125
  F1 Score: 0.7911053936896042
  AUC: 0.791178125



In [31]:
# Initialize a dictionary to store performance metrics
performance_metrics = {
    'Model': [],
    'Training Time': [],
    'Accuracy': [],
    'Precision': [],
    'Recall': [],
    'F1 Score': [],
    'AUC': []
}

# Iterate over each model
models = {
    'SVM' : svm_classifier,
    'Logistic Regression' : logistic_classifier,
    'Naive Bayes' : nb_classifier,
    'Random Forest' : rf_classifier,
    'Ensemble' : ensemble_classifier
}

for name, model in models.items():
    
    start_time = time.time()
    model.fit(X_train_vectorized, y_train)
    training_time = time.time() - start_time
    
    # Make predictions
    predictions = model.predict(X_test_vectorized)
    
    # Compute performance metrics
    accuracy = accuracy_score(y_test, predictions)
    precision = precision_score(y_test, predictions, average='weighted')
    recall = recall_score(y_test, predictions, average='weighted')
    f1 = f1_score(y_test, predictions, average='weighted')
    auc = roc_auc_score(y_test, predictions, average='weighted', multi_class='ovr')
    
    print(f"{name}:")
    print(f"  Training Time: {training_time} seconds")
    print(f"  Accuracy: {accuracy*100}%")
    print(f"  Precision: {precision}")
    print(f"  Recall: {recall}")
    print(f"  F1 Score: {f1}")
    print(f"  AUC: {auc}")
    print()
    
    # Add metrics to the dictionary
    performance_metrics['Model'].append(name)
    performance_metrics['Training Time'].append(training_time)
    performance_metrics['Accuracy'].append(accuracy*100)
    performance_metrics['Precision'].append(precision)
    performance_metrics['Recall'].append(recall)
    performance_metrics['F1 Score'].append(f1)
    performance_metrics['AUC'].append(auc)

# Convert dictionary to DataFrame
metrics_df = pd.DataFrame(performance_metrics)

# Save DataFrame to CSV file
metrics_df.to_csv('twitter_performance_metrics.csv', index=False)

SVM:
  Training Time: 23.858806848526 seconds
  Accuracy: 79.1153125%
  Precision: 0.7916387472714111
  Recall: 0.791153125
  F1 Score: 0.7910661484472313
  AUC: 0.7911531250000001

Logistic Regression:
  Training Time: 147.02249765396118 seconds
  Accuracy: 79.156875%
  Precision: 0.7919111800586576
  Recall: 0.79156875
  F1 Score: 0.7915076063478679
  AUC: 0.7915687499999999

Naive Bayes:
  Training Time: 0.5826525688171387 seconds
  Accuracy: 77.4628125%
  Precision: 0.7746282619815733
  Recall: 0.774628125
  F1 Score: 0.7746280968967398
  AUC: 0.774628125

Random Forest:
  Training Time: 253.37848567962646 seconds
  Accuracy: 74.3390625%
  Precision: 0.7443069708322817
  Recall: 0.743390625
  F1 Score: 0.7431497767438087
  AUC: 0.743390625

Ensemble:
  Training Time: 203.08208203315735 seconds
  Accuracy: 79.1178125%
  Precision: 0.7915842111530605
  Recall: 0.791178125
  F1 Score: 0.7911053936896042
  AUC: 0.791178125



In [32]:
import multiprocessing as mp

In [33]:
print(mp.cpu_count())

8


In [34]:
from sklearn.metrics import confusion_matrix

In [35]:
# Initialize a dictionary to store confusion matrices
confusion_matrices = {}

for name, model in models.items():
    # Make predictions
    predictions = model.predict(X_test_vectorized)
    
    # Compute confusion matrix
    cm = confusion_matrix(y_test, predictions)
    
    # Store confusion matrix in the dictionary
    confusion_matrices[name] = cm
    
    # Save confusion matrix to a pickle file
    filename = f'twitter_{name}_cm.pkl'
    joblib.dump(cm, filename)

In [36]:
from sklearn.metrics import roc_curve

In [37]:
# Initialize a dictionary to store ROC curve data
roc_curve_data = {}

# Iterate over each model
for name, model in models.items():
    # Make predictions
    predictions = model.predict(X_test_vectorized)
    
    # Compute false positive rate, true positive rate, and thresholds
    fpr, tpr, thresholds = roc_curve(y_test, predictions)
    
    # Store ROC curve data in the dictionary
    roc_curve_data[name] = {'fpr': fpr, 'tpr': tpr}

# Save ROC curve data to a Joblib file
joblib.dump(roc_curve_data, 'twitter_roc_curve_data.joblib')

['twitter_roc_curve_data.joblib']

In [2]:
import joblib
import multiprocessing
num_cores = multiprocessing.cpu_count()
print(num_cores)

8
