In [None]:
# imports
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from wordcloud import WordCloud

In [None]:
# Names used in place of the numeric label codes.
label_mapping = {0: 'World', 1: 'Sports', 2: 'Business', 3: 'Sci/Tech'}

# Read the training set and swap the numeric labels for their names in one chain.
df = pd.read_csv('datasets/training_data.csv').assign(
    label=lambda frame: frame['label'].map(label_mapping)
)

# Preview the first seven rows.
df.head(7)

In [None]:
# Class Distribution: number of rows per news category.
class_distribution = df['label'].value_counts()

# Print the header and the counts separately: passing both to one print() call
# inserts the separator space right after "\n", which shifts the first row of
# the Series one column to the right.
print("Class Distribution:")
print(class_distribution)

In [None]:
# Bar chart of how many rows each class contributes.
fig, ax = plt.subplots(figsize=(8, 4))
label_counts = df['label'].value_counts()
label_counts.plot(kind='bar', color='skyblue', ax=ax)
ax.set_title('Class Distribution')
ax.set_xlabel('Class')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# Compiled once instead of on every row: matches any character that is not an
# ASCII letter.
NON_ALPHA_RE = re.compile('[^a-zA-Z]')
ps = PorterStemmer()
sw = set(stopwords.words('english'))


def clean_text(text):
    """Normalise a single document.

    Every non-letter character is replaced by a space, the result is
    lower-cased and split on whitespace, English stop words are dropped and
    the remaining tokens are Porter-stemmed.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    str
        The stemmed tokens joined by single spaces.
    """
    tokens = NON_ALPHA_RE.sub(' ', text).lower().split()
    # Stop words are filtered before stemming, i.e. on the unstemmed token.
    return ' '.join(ps.stem(w) for w in tokens if w not in sw)


# Iterate over the column values rather than `df['text'][i]` for i in
# range(size): that lookup is label-based, so it raises KeyError (or reads the
# wrong row) as soon as the index is not exactly 0..n-1, e.g. after filtering.
corpus = [clean_text(text) for text in df['text']]

# NOTE: this overwrites the raw text in place, so running this cell twice
# re-processes already-stemmed text.
df['text'] = corpus
df.head()

In [None]:
# Text Length Distribution: number of whitespace-separated tokens per document.
df['text_length'] = df['text'].str.split().str.len()

fig, ax = plt.subplots()
ax.hist(df['text_length'], bins=30, color='blue', edgecolor='black')
ax.set_title('Text Length Distribution')
ax.set_xlabel('Text Length')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Word Cloud built from every document in the dataset joined into one string.
all_text = ' '.join(df['text'])
wordcloud = WordCloud(width=800, height=400, background_color='white').generate(all_text)

fig, ax = plt.subplots(figsize=(10, 6))
ax.imshow(wordcloud, interpolation='bilinear')
ax.axis('off')
ax.set_title('Word Cloud of all the dataset')
plt.show()

In [None]:
# Word frequency per class: draw one word cloud for each distinct label.
for label in df['label'].unique():
    # Join all documents carrying the current label into a single string.
    class_text = ' '.join(df.loc[df['label'] == label, 'text'])
    label_wordcloud = WordCloud(width=800, height=400, background_color='white').generate(class_text)

    fig, ax = plt.subplots(figsize=(10, 6))
    ax.imshow(label_wordcloud, interpolation='bilinear')
    ax.axis('off')
    ax.set_title(f'Word Cloud for Class: {label}')
    plt.show()

In [None]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()

# Cut-offs recommended by the VADER authors for the normalised compound score.
POSITIVE_THRESHOLD = 0.05
NEGATIVE_THRESHOLD = -0.05


def categorize_sentiment(compound):
    """Map a VADER compound score to a sentiment label.

    Parameters
    ----------
    compound : float
        The 'compound' value returned by ``analyzer.polarity_scores``.

    Returns
    -------
    str
        'Positive' when the score is at or above ``POSITIVE_THRESHOLD``,
        'Negative' when it is at or below ``NEGATIVE_THRESHOLD``,
        'Neutral' otherwise.
    """
    if compound >= POSITIVE_THRESHOLD:
        return 'Positive'
    if compound <= NEGATIVE_THRESHOLD:
        return 'Negative'
    return 'Neutral'


# Categorize sentiment into positive, negative, and neutral.
# Previously every score <= 0 (including an exactly neutral 0.0) was labelled
# 'Negative' and no 'Neutral' class existed.
# NOTE(review): at this point df['text'] has been lower-cased, stripped of
# punctuation and stemmed by the preprocessing cell; VADER presumably scores
# unprocessed text more reliably -- consider scoring the raw text instead.
df['sentiment_category'] = df['text'].apply(
    lambda x: categorize_sentiment(analyzer.polarity_scores(x)['compound'])
)

# Print the sentiment analysis results
print("Sentiment Analysis Results:")
print(df[['text', 'label', 'sentiment_category']])


In [None]:
import seaborn as sns

# Grouped bar chart: for every news category, one bar per sentiment category.
fig, ax = plt.subplots(figsize=(10, 5))
sns.countplot(data=df, x='label', hue='sentiment_category', ax=ax)
ax.set_title('Sentiment Analysis by News Category')
ax.set_xlabel('News Category')
ax.set_ylabel('Count')
plt.show()

### Some conclusions..
- The token 'AP AP' does not seem useful: it appears in every type of news, so it should be removed.
- Should we delete rows that contain only a few characters from the dataset?