In [1]:
from pymongo import MongoClient
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
import seaborn as sns
import spacy
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import nltk

In [2]:
# Client for the MongoDB server addressed by the URI below (localhost, port 27017).
client = MongoClient("mongodb://localhost:27017/")

In [3]:
# Select the "PythonForDs" database and, inside it, the "isw_report" collection
# that the rest of the notebook reads from.
db = client["PythonForDs"]
collection = db["isw_report"]

In [4]:
# find() is called without a filter, i.e. the query is not restricted to a subset
# of the collection. The result is a cursor that the next cell turns into a list.
cursor = collection.find()

In [5]:
# Load every document of the collection into a DataFrame.
# The query is issued inside this cell (instead of consuming the `cursor` object
# created in another cell) so that the cell is idempotent: a cursor can only be
# iterated once, and re-running the cell on an exhausted cursor would silently
# produce an empty DataFrame.
df = pd.DataFrame(list(collection.find()))

Standard checks to inspect the dataset:

In [6]:
# Overview of the frame: row count, column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1123 entries, 0 to 1122
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   _id             1123 non-null   object        
 1   date            1123 non-null   datetime64[ns]
 2   extracted_text  1123 non-null   object        
dtypes: datetime64[ns](1), object(2)
memory usage: 26.4+ KB


In [7]:
# Summary statistics. In the output captured below only the `date` column is
# summarised; the other two columns have object dtype.
df.describe()

Unnamed: 0,date
count,1123
mean,2023-09-19 19:34:34.087266048
min,2022-02-25 00:00:00
25%,2022-12-12 12:00:00
50%,2023-09-21 00:00:00
75%,2024-06-29 12:00:00
max,2025-04-08 00:00:00


In [8]:
# Number of missing values per column.
df.isnull().sum()

_id               0
date              0
extracted_text    0
dtype: int64

In [9]:
# Peek at the first rows to see what the raw records look like.
df.head()

Unnamed: 0,_id,date,extracted_text
0,67f568f9dcd7a5c401c30a6b,2022-02-25,Russian forces entered major Ukrainian cities—...
1,67f568f9dcd7a5c401c30a6c,2022-02-26,Russian forces’ main axes of advance in the la...
2,67f568f9dcd7a5c401c30a6d,2022-02-27,The Russian military has likely recognized tha...
3,67f568f9dcd7a5c401c30a6e,2022-02-28,The Russian military is reorganizing its milit...
4,67f568f9dcd7a5c401c30a6f,2022-03-01,Russian forces are completing the reinforcemen...


In [10]:
# Only lemmas are needed here, so the dependency parser and the named-entity
# recogniser are switched off at load time. Running the full pipeline on each of
# the ~1100 long reports is very slow. The tagger / attribute_ruler stay enabled
# because the lemmatizer relies on their part-of-speech output
# (NOTE(review): confirm against the spaCy docs for the installed model version).
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])


def lemmatize_text(text):
    """Return `text` with every token replaced by its lemma.

    Parameters
    ----------
    text : str
        Raw text to process with the module-level `nlp` pipeline.

    Returns
    -------
    str
        The lemmas of all tokens, joined by single spaces.
    """
    return " ".join(token.lemma_ for token in nlp(text))


# Create a new column 'lemmatized_text'.
# nlp.pipe processes the reports as a batched stream, which is considerably
# faster than calling nlp() once per row through Series.apply; it yields one
# Doc per input text, in input order, so the list lines up with df's rows.
df['lemmatized_text'] = [
    " ".join(token.lemma_ for token in doc)
    for doc in nlp.pipe(df['extracted_text'], batch_size=32)
]



KeyboardInterrupt: 

Visualization of basic parameters:

In [None]:
# Character count of every report; the column is reused by later cells.
df['text_length'] = df['extracted_text'].map(len)

# Central tendencies are computed once and used for both the marker lines and their labels.
length_mean = df['text_length'].mean()
length_median = df['text_length'].median()

fig, ax = plt.subplots(figsize=(14, 6))
sns.histplot(df['text_length'], bins=50, kde=True, color='teal', ax=ax)
ax.set_title('Distribution of Text Length in ISW Reports')
ax.set_xlabel('Text Length (characters)')
ax.set_ylabel('Count')
ax.axvline(length_mean, color='red', linestyle='--',
           label=f'Mean: {length_mean:.2f}')
ax.axvline(length_median, color='green', linestyle='--',
           label=f'Median: {length_median:.2f}')
ax.legend()
ax.grid(True, alpha=0.3)
fig.tight_layout()
fig.savefig('isw_text_length.png', dpi=300)
plt.show()

In [None]:
# Bucket every report into its calendar month and count reports per bucket.
df['year_month'] = df['date'].dt.to_period('M')
monthly_counts = df.groupby('year_month').size()
# String labels make the bar chart's x axis readable.
monthly_counts.index = monthly_counts.index.astype(str)

fig, ax = plt.subplots(figsize=(16, 6))
monthly_counts.plot(kind='bar', color='darkgreen', ax=ax)
ax.set_title('Number of ISW Reports by Month')
ax.set_xlabel('Month')
ax.set_ylabel('Count')
plt.setp(ax.get_xticklabels(), rotation=45)
ax.grid(True, alpha=0.3)
fig.tight_layout()
fig.savefig('isw_reports_by_month.png', dpi=300)
plt.show()

In [None]:
# Fixed Monday-to-Sunday order so the bars follow the calendar week rather than frequency.
day_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

df['day_of_week'] = df['date'].dt.day_name()
day_counts = df['day_of_week'].value_counts().reindex(day_order)

fig, ax = plt.subplots(figsize=(14, 6))
sns.barplot(x=day_counts.index, y=day_counts.values, palette='Blues_d', ax=ax)
ax.set_title('Distribution of ISW Reports by Day of Week', fontsize=16)
ax.set_xlabel('Day of Week', fontsize=14)
ax.set_ylabel('Number of Reports', fontsize=14)
ax.grid(True, alpha=0.3)
fig.tight_layout()
fig.savefig('isw_reports_by_day.png', dpi=300)
plt.show()

Working with text:

In [None]:
# Per-report size measures: characters, and whitespace-separated words.
df['text_length'] = df['extracted_text'].map(len)
df['word_count'] = df['extracted_text'].map(lambda report: len(str(report).split()))

char_lengths = df['text_length']
word_counts = df['word_count']

# Text length statistics
print("\nText length statistics:")
print(f"Mean text length: {char_lengths.mean():.2f} characters")
print(f"Median text length: {char_lengths.median():.2f} characters")
print(f"Min text length: {char_lengths.min()} characters")
print(f"Max text length: {char_lengths.max()} characters")
print(f"Mean word count: {word_counts.mean():.2f} words")
print(f"Median word count: {word_counts.median():.2f} words")

In [None]:
# Mean and median are computed once and reused for the marker lines and legend labels.
words_mean = df['word_count'].mean()
words_median = df['word_count'].median()

fig, ax = plt.subplots(figsize=(14, 6))
sns.histplot(df['word_count'], bins=50, kde=True, color='darkgreen', ax=ax)
ax.set_title('Distribution of Word Count in ISW Reports', fontsize=16)
ax.set_xlabel('Word Count', fontsize=14)
ax.set_ylabel('Frequency', fontsize=14)
ax.axvline(words_mean, color='red', linestyle='--',
           label=f'Mean: {words_mean:.2f}')
ax.axvline(words_median, color='blue', linestyle='--',
           label=f'Median: {words_median:.2f}')
ax.legend()
ax.grid(True, alpha=0.3)
fig.tight_layout()
fig.savefig('isw_word_count_distribution.png', dpi=300)
plt.show()

Because of the lack of reports in the first months, the graphs have problems for that period:

In [None]:
# Mean report length per calendar date, drawn as a line over time.
text_length_by_date = df['text_length'].groupby(df['date'].dt.date).mean()

fig, ax = plt.subplots(figsize=(16, 6))
text_length_by_date.plot(color='purple', linewidth=2, ax=ax)
ax.set_title('Average Text Length Over Time', fontsize=16)
ax.set_xlabel('Date', fontsize=14)
ax.set_ylabel('Average Text Length (characters)', fontsize=14)
ax.grid(True, alpha=0.3)
fig.tight_layout()
fig.savefig('isw_text_length_over_time.png', dpi=300)
plt.show()

In [None]:
# TF-IDF over the raw report text, with English stop words removed and the
# vocabulary capped at 20 features.
vectorizer = TfidfVectorizer(max_features=20, stop_words='english')
X_tfidf = vectorizer.fit_transform(df['extracted_text'])

# One row per report, one column per retained term.
tfidf_scores = pd.DataFrame(X_tfidf.toarray(), columns=vectorizer.get_feature_names_out())

# Average score of each term across all reports, highest first.
mean_tfidf = tfidf_scores.mean().sort_values(ascending=False)
ax = mean_tfidf.plot(kind='bar', figsize=(12,6))
ax.set_title("Top tf-idf in ISW reports")
ax.set_ylabel("avg meaning tf-idf")
plt.show()


In [None]:
# Terms to flag in each report. One boolean column `has_<keyword>` is added per
# term; the column name keeps the keyword's original spelling.
keywords = ["airstrike", "missile", "Belarus", "drone", "air defense", "infrastructure", "bomb"]

# Lower-case the text once instead of once per keyword.
lowered_text = df['extracted_text'].str.lower()
for word in keywords:
    # The keyword is lower-cased as well: the text has been lower-cased, so a
    # keyword containing capitals (e.g. "Belarus") could otherwise never match.
    # regex=False makes this a plain substring test, so keywords are never
    # interpreted as regular-expression patterns.
    df[f'has_{word}'] = lowered_text.str.contains(word.lower(), regex=False)

In [None]:
# Show the first rows again, now including the columns added by the cells above.
df.head()

In [None]:
# Sum the boolean keyword flags per date: for each date, the number of reports
# that mention each keyword.
flag_columns = [f'has_{w}' for w in keywords]
mentions_over_time = df.groupby('date')[flag_columns].sum()

In [None]:
# Display the per-date keyword mention counts computed in the previous cell.
mentions_over_time