In [1]:
import nltk
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import re
import requests

In [2]:
Russiawiki=requests.get("https://en.wikipedia.org/wiki/Russia")

In [3]:
from bs4 import BeautifulSoup
RussiaDf = BeautifulSoup(Russiawiki.content, "lxml")

In [4]:
RussiaText = RussiaDf.getText(strip=True)

In [5]:
for oprn in [r'\[\d+\]', r'\(\d+\)', r'\(\w+\)', r'\[\w+\]', r'[0-9]+', "\xa0°C"]:
    RussiaText = re.sub(oprn, " ", RussiaText)

In [6]:
from nltk.tokenize import sent_tokenize

In [7]:
RussiaSentances = sent_tokenize(RussiaText)

In [8]:
RussiaSentences = pd.DataFrame(RussiaSentances, columns=['sentence'])

In [9]:
from textblob import TextBlob
s1 = TextBlob("The English name Russia first appeared in the 14th century, borrowed from Medieval Latin")

In [10]:
s1.sentiment

Sentiment(polarity=0.08333333333333333, subjectivity=0.1111111111111111)

In [11]:
def analyze_sentiment(text):
    analysis = TextBlob(text)
    if analysis.sentiment.polarity>0:
        return "Positive"
    elif analysis.sentiment.polarity==0:
        return "Neutral"
    else:
        return "Negative"

In [12]:
RussiaSentences['sentiment'] = [str(analyze_sentiment(x)) for x in RussiaSentences.sentence]

In [13]:
RussiaSentences['sentiment'].value_counts()

sentiment
Neutral     1954
Positive     619
Negative     190
Name: count, dtype: int64

In [14]:
from nltk.tokenize import word_tokenize

RussiaWords = word_tokenize(RussiaText)

In [15]:
RussiaWords = [w.lower() for w in RussiaWords]

In [16]:
RussiaWords = [w for w in RussiaWords if w.isalnum()]

In [17]:
from nltk.corpus import stopwords
english_stopwords = set(stopwords.words("english"))

In [18]:
RussiaWords = [w for w in RussiaWords if not w in english_stopwords]

In [19]:
RussiaWords = [w for w in RussiaWords if len(w)>2]

In [20]:
from nltk.probability import FreqDist
wordfreq = FreqDist(RussiaWords)

In [21]:
wordfreq. most_common(20)

[('russia', 482),
 ('retrieved', 388),
 ('russian', 329),
 ('world', 138),
 ('original', 136),
 ('march', 129),
 ('january', 128),
 ('soviet', 110),
 ('war', 108),
 ('april', 98),
 ('may', 96),
 ('june', 95),
 ('july', 88),
 ('press', 77),
 ('history', 75),
 ('february', 74),
 ('august', 70),
 ('cid', 69),
 ('university', 68),
 ('country', 67)]

In [22]:
from wordcloud import WordCloud

In [23]:
Russia_Words = ("").join(RussiaWords)

In [88]:
# wordcloud = WordCloud(width = 1000, height = 500, stopwords = "english_stopwords",max_words=100, colormap="plasma", collocations=False).generate(str(Russia_Words))

In [90]:
# plt.figure(figsize=(20,10))
# plt.imshow(wordcloud)
# plt.show()

In [26]:
RussiaSentences

Unnamed: 0,sentence,sentiment
0,Russia - WikipediaJump to contentMain menuMain...,Neutral
1,Early history .,Positive
2,Kievan Rus' .,Neutral
3,Grand Principality of Moscow .,Positive
4,Tsardom of Russia .,Neutral
...,...,...
2758,Oceanic islandswithin the vicinity of Europe a...,Negative
2759,Governed by theHoly Seewhich has sovereignty o...,Positive
2760,License;\nadditional terms may apply.,Neutral
2761,"By using this site, you agree to theTerms of U...",Neutral


In [27]:
RussiaSentences = RussiaSentences[RussiaSentences["sentiment"] != "Neutral"]

In [28]:
RussiaSentences["sentiment"] = RussiaSentences["sentiment"].map({"Negative": 0, "Positive": 1})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  RussiaSentences["sentiment"] = RussiaSentences["sentiment"].map({"Negative": 0, "Positive": 1})


In [29]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

In [30]:
X_train, X_test, y_train, y_test = train_test_split(
    RussiaSentences["sentence"], RussiaSentences["sentiment"], 
    test_size=0.20, random_state=42
)

In [31]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE

In [32]:
tfidf = TfidfVectorizer(max_features=1000, stop_words="english", ngram_range=(1,1))
X_train_tfidf = tfidf.fit_transform(X_train)  
X_test_tfidf = tfidf.transform(X_test)  

In [33]:
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_tfidf, y_train)

In [34]:
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train_resampled, y_train_resampled)  # Train on resampled data
y_pred_lr = lr.predict(X_test_tfidf)  # Use test data (no resampling!)

print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred_lr))
print("Classification Report:\n", classification_report(y_test, y_pred_lr))

Logistic Regression Accuracy: 0.8148148148148148
Classification Report:
               precision    recall  f1-score   support

           0       0.62      0.62      0.62        40
           1       0.88      0.88      0.88       122

    accuracy                           0.81       162
   macro avg       0.75      0.75      0.75       162
weighted avg       0.81      0.81      0.81       162



In [35]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier

In [36]:
dt = DecisionTreeClassifier(max_depth=9,splitter="best")
dt.fit(X_train_resampled, y_train_resampled)
y_pred_dt = dt.predict(X_test_tfidf)
print("Decision Tree Accuracy:", accuracy_score(y_test, y_pred_dt))
print("Classification Report:\n", classification_report(y_test, y_pred_dt))

Decision Tree Accuracy: 0.7839506172839507
Classification Report:
               precision    recall  f1-score   support

           0       0.73      0.20      0.31        40
           1       0.79      0.98      0.87       122

    accuracy                           0.78       162
   macro avg       0.76      0.59      0.59       162
weighted avg       0.77      0.78      0.73       162



In [37]:
rf = RandomForestClassifier()
rf.fit(X_train_resampled, y_train_resampled)
y_pred_rf = rf.predict(X_test_tfidf)
print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))
print("Classification Report:\n", classification_report(y_test, y_pred_rf))

Random Forest Accuracy: 0.8518518518518519
Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.47      0.61        40
           1       0.85      0.98      0.91       122

    accuracy                           0.85       162
   macro avg       0.86      0.73      0.76       162
weighted avg       0.85      0.85      0.84       162



In [38]:
gb = GradientBoostingClassifier()
gb.fit(X_train_resampled, y_train_resampled)
y_pred_gb = gb.predict(X_test_tfidf)
print("Gradient Boosting Accuracy:", accuracy_score(y_test, y_pred_gb))
print("Classification Report:\n", classification_report(y_test, y_pred_gb))

Gradient Boosting Accuracy: 0.8271604938271605
Classification Report:
               precision    recall  f1-score   support

           0       0.70      0.53      0.60        40
           1       0.86      0.93      0.89       122

    accuracy                           0.83       162
   macro avg       0.78      0.73      0.74       162
weighted avg       0.82      0.83      0.82       162



In [39]:
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(X_train_resampled, y_train_resampled)
y_pred_knn = knn.predict(X_test_tfidf)
print("KNN Accuracy:", accuracy_score(y_test, y_pred_knn))
print("Classification Report:\n", classification_report(y_test, y_pred_knn))

KNN Accuracy: 0.7283950617283951
Classification Report:
               precision    recall  f1-score   support

           0       0.40      0.20      0.27        40
           1       0.77      0.90      0.83       122

    accuracy                           0.73       162
   macro avg       0.59      0.55      0.55       162
weighted avg       0.68      0.73      0.69       162



In [40]:
import pickle

# ✅ Save the trained TF-IDF vectorizer
with open("tfidf_vectorizer.pkl", "wb") as f:
    pickle.dump(tfidf, f)

# ✅ Save the best trained model (Logistic Regression)
with open("sentiment_model.pkl", "wb") as f:
    pickle.dump(rf, f)

print("TF-IDF Vectorizer & Sentiment Model saved successfully!")

TF-IDF Vectorizer & Sentiment Model saved successfully!
