In [None]:
# imports
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from wordcloud import WordCloud
import re
import seaborn as sns
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [None]:
# Human-readable names for the integer class ids stored in the CSV
label_mapping = {0: 'World', 1: 'Sports', 2: 'Business', 3: 'Sci/Tech'}

# Read the training set and translate the numeric 'label' column in one chain;
# ids missing from label_mapping become NaN, as Series.map does by default.
df = pd.read_csv('datasets/training_data.csv').assign(
    label=lambda frame: frame['label'].map(label_mapping)
)

# Preview the first seven rows
df.head(7)

## Class Distribution

The dataset is perfectly balanced: every label has the same number of samples, exactly 30,000 news items each.

In [None]:
# Number of news items per label
class_distribution = df['label'].value_counts()
print("Class Distribution:\n", class_distribution)

# Bar chart of those same counts, drawn on an explicit Axes
fig, ax = plt.subplots(figsize=(8, 4))
class_distribution.plot(kind='bar', color='skyblue', ax=ax)
ax.set_title('Class Distribution')
ax.set_xlabel('Class')
ax.set_ylabel('Frequency')
plt.show()

## Top Words by Label
Identify the top TF-IDF weighted words for each label. This can give us insights into the most distinctive words for each category.

In [None]:
# For every label, merge all of its news into one document and rank that
# document's terms by TF-IDF weight against the individual news items of the
# other labels. The ten heaviest terms are printed, plotted and stored in
# top_words_by_label.
ps = PorterStemmer()
sw = set(stopwords.words('english'))

# TF-IDF Vectorization
vectorizer = TfidfVectorizer(stop_words="english")

labels = df['label'].unique()


def _normalise(doc):
    """Keep letters only, lower-case, drop NLTK stop words and Porter-stem the rest."""
    tokens = re.sub('[^a-zA-Z]', ' ', doc).lower().split()
    return ' '.join(ps.stem(w) for w in tokens if w not in sw)


# Normalise every text once, up front. The merged label document and the
# documents it is compared against must share the same vocabulary: if only the
# merged document is stemmed, its stems never occur in the raw comparison
# documents and receive an artificially high IDF. Doing it once here costs the
# same total stemming work as stemming each label's rows inside the loop.
cleaned_text = pd.Series([_normalise(row) for row in df['text']], index=df.index)

# Top 10 (word, weight) pairs for each label
top_words_by_label = {label: [] for label in labels}

for label in labels:

    is_label = df['label'] == label

    # All news of this label joined into a single text, placed after the
    # individual documents of every other label
    merged_doc = ' '.join(cleaned_text[is_label])
    documents = list(cleaned_text[~is_label]) + [merged_doc]

    X = vectorizer.fit_transform(documents)
    feature_names_list = vectorizer.get_feature_names_out()

    # The merged document is always the last row, however many rows the other
    # labels contribute (a fixed index is only valid for one dataset size)
    merged_row = X.shape[0] - 1
    feature_weights = zip(feature_names_list, X[merged_row].toarray()[0])
    top_10_features = sorted(feature_weights, key=lambda x: x[1], reverse=True)[:10]
    top_words_by_label[label] = top_10_features

    print(f'\nWeights for label {label}')
    print('--------------------------')

    for idx, (feature, count) in enumerate(top_10_features, start=1):
        print(f"{idx} - {feature}: {count}")
    print()

    # Visualize the top 10 TF-IDF words using a bar plot
    plt.figure(figsize=(8, 4))
    sns.barplot(x=[feature for feature, _ in top_10_features], y=[count for _, count in top_10_features])
    plt.title(f'Top 10 TF-IDF Words for Label: {label}')
    plt.ylabel('TF-IDF Weight')
    plt.xticks(rotation=45, ha='right')
    plt.show()

### Let's clean the text..

In [None]:
# Normalise every news text: keep letters only, lower-case, drop NLTK English
# stop words and Porter-stem the remaining tokens.
#
# NOTE: this overwrites df['text'] in place, so the raw text is no longer
# available afterwards and re-running the cell stems already-stemmed text.
# Reload the dataset before running it again.
ps = PorterStemmer()
sw = set(stopwords.words('english'))

corpus = []
# Iterate over the column values directly: df['text'][i] is a label lookup and
# only matches the i-th row when the frame still has its default 0..n-1 index.
for raw_text in df['text']:
    # replace every non-alphabetic character with a space, then lower-case
    letters_only = re.sub('[^a-zA-Z]', ' ', raw_text).lower()
    # split into tokens, remove stop words and stem what is left
    corpus.append(' '.join(ps.stem(w) for w in letters_only.split() if w not in sw))

df['text'] = corpus
df.head()

## Text Length Distribution

In [None]:
# Histogram of the number of whitespace-separated tokens per text
df['text_length'] = df['text'].map(lambda doc: len(doc.split()))

fig, ax = plt.subplots()
ax.hist(df['text_length'], bins=30, color='blue', edgecolor='black')
ax.set_title('Text Length Distribution')
ax.set_xlabel('Text Length')
ax.set_ylabel('Frequency')

# The x-axis runs from 0 to the longest text in the dataset
max_text_length = df['text_length'].max()
ax.set_xlim(0, max_text_length)

plt.show()

In [None]:
# Same token-count histogram as before, with the y-axis clipped so the sparse
# bins become visible
df['text_length'] = df['text'].map(lambda doc: len(doc.split()))

fig, ax = plt.subplots()
ax.hist(df['text_length'], bins=30, color='blue', edgecolor='black')
ax.set_title('Text Length Distribution')
ax.set_xlabel('Text Length')
ax.set_ylabel('Frequency')

# The x-axis runs from 0 to the longest text in the dataset
max_text_length = df['text_length'].max()
ax.set_xlim(0, max_text_length)

# Zoom in on the low-frequency bins; adjust the upper limit as needed
ax.set_ylim(0, 250)

plt.show()


Through this analysis we can conclude that most news items have a text length between 10 and 60 words. It may be worth **discarding rows whose text is shorter than 10 or longer than 60 words**.

## Word Clouds
Word clouds can assist in summarizing the main content of a document or set of documents. The most prominent words in the cloud provide a snapshot of the key information.

In [None]:
# One word cloud built from every text in the dataset joined together
all_text = ' '.join(df['text'])
wordcloud = WordCloud(width=800, height=400, background_color='white').generate(all_text)

fig, ax = plt.subplots(figsize=(10, 6))
ax.imshow(wordcloud, interpolation='bilinear')
ax.axis('off')
ax.set_title('Word Cloud of all the dataset')
plt.show()

In [None]:
# One word cloud per class, each built from that class's texts only
for label in df['label'].unique():
    class_text = ' '.join(df.loc[df['label'] == label, 'text'])
    class_cloud = WordCloud(width=800, height=400, background_color='white').generate(class_text)

    fig, ax = plt.subplots(figsize=(10, 6))
    ax.imshow(class_cloud, interpolation='bilinear')
    ax.axis('off')
    ax.set_title(f'Word Cloud for Class: {label}')
    plt.show()

Word clouds provide an intuitive way to visualize the frequency of words in a given text or dataset. The word *"AP AP"* seems to be of little use, appearing many times in all kinds of news, so it **should be removed.**

## Sentiment Analysis
It helps to understand the sentiment trends in each news category, providing insights into the overall tone of the news content.

In [None]:
analyzer = SentimentIntensityAnalyzer()

# NOTE(review): df['text'] was replaced earlier in the notebook by its
# lower-cased, letters-only, stemmed form. VADER is lexicon-based, so scoring
# the raw text would presumably be more accurate; confirm and, if so, keep a
# copy of the raw column for this step.


def _sentiment_category(text):
    """Map a text's VADER compound score to 'Positive', 'Neutral' or 'Negative'.

    Uses the thresholds recommended by the VADER authors: compound >= 0.05 is
    positive, compound <= -0.05 is negative, anything in between is neutral.
    Treating every non-positive score as negative would count neutral texts
    (compound == 0) as negative.
    """
    compound = analyzer.polarity_scores(text)['compound']
    if compound >= 0.05:
        return 'Positive'
    if compound <= -0.05:
        return 'Negative'
    return 'Neutral'


# Categorize sentiment into positive, negative, and neutral
df['sentiment_category'] = df['text'].apply(_sentiment_category)

# Show sentiment analysis results as a label x sentiment count table
print("Sentiment Analysis Results:")
print(pd.crosstab(df['label'], df['sentiment_category']))

# Explore the relationship between labels and sentiment categories
plt.figure(figsize=(10, 5))
sns.countplot(
    x='label',
    hue='sentiment_category',
    hue_order=['Positive', 'Neutral', 'Negative'],
    data=df,
)
plt.title('Sentiment Analysis by News Category')
plt.xlabel('News Category')
plt.ylabel('Count')
plt.show()

*World* news category seems to **have more negative content** than the others.