In [2]:
import pandas as pd
import numpy as np
from keras.models import load_model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.feature_extraction.text import TfidfVectorizer
import shap
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report   

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
model = load_model('model_big.h5')
new_data = pd.read_csv('cleaned_trump_tweets.csv')
new_data_cleaned = new_data.dropna(subset=['content'])

tokenizer = Tokenizer(num_words=500, split=' ') 
tokenizer.fit_on_texts(new_data_cleaned['content'].values)
X_new = tokenizer.texts_to_sequences(new_data_cleaned['content'].values)
X_new = pad_sequences(X_new, maxlen=model.input_shape[1])

predictions = model.predict(X_new)
sentiment_labels = ['Negative', 'Neutral', 'Positive']
predicted_labels = [sentiment_labels[np.argmax(pred)] for pred in predictions]

# Add predictions to the dataframe
new_data_cleaned['Predicted_Sentiment'] = predicted_labels

new_data_cleaned.to_csv('predicted_sentiments.csv', index=False)
print(new_data_cleaned[['content', 'Predicted_Sentiment']].head())




[1m1352/1352[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m872s[0m 640ms/step


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_data_cleaned['Predicted_Sentiment'] = predicted_labels


                                             content Predicted_Sentiment
0  sure tune watch donald trump late night david ...             Neutral
1  donald trump appearing view tomorrow morning d...             Neutral
2  donald trump reads top ten financial tips late...             Neutral
3  new blog post celebrity apprentice finale less...            Negative
4  persona never wallflower rather build walls cl...             Neutral


In [4]:
X=new_data_cleaned['content']
y=new_data_cleaned['Predicted_Sentiment']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)
clf=RandomForestClassifier()
clf.fit(X_train_tfidf, y_train)

y_pred=clf.predict(X_test_tfidf)

print(classification_report(y_pred, y_test))



              precision    recall  f1-score   support

    Negative       0.76      0.86      0.81      1528
     Neutral       0.88      0.90      0.89      4268
    Positive       0.96      0.86      0.90      2855

    accuracy                           0.88      8651
   macro avg       0.87      0.87      0.87      8651
weighted avg       0.88      0.88      0.88      8651



In [1]:
X_test_sample = X_test_tfidf[:10]
X_test_sample_dense = X_test_sample.toarray()

explainer = shap.TreeExplainer(clf)

shap_values = explainer.shap_values(X_test_sample_dense)

shap.initjs()
shap.summary_plot(shap_values, X_test_tfidf, feature_names=vectorizer.get_feature_names_out())

NameError: name 'X_test_tfidf' is not defined