# NLP Analysis

# Imports

In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

np.random.seed(1)

from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Functions

In [17]:
from collections import Counter

import torch


def read_dataframes(PATH, filepaths):

  dataframes = []

  for file in filepaths:
    dataframes.append(pd.read_csv(PATH + file +'.csv'))
  
  return dataframes

def check_class_imbalance(df):
  y = df.values[:,-1]
  counter = Counter(y)
  for k,v in counter.items():
    per = v/len(y) * 100
    print("Class=%s, Count=%d, Percentage=%.3f%%" % (k,v, per))

def sentiment_score(review, tokenizer, model):
  tokens = tokenizer.encode(review, return_tensors='pt')
  result = model(tokens)
  return int(torch.argmax(result.logits)) +1

# Read in Dataframes

In [6]:
PATH = "/Users/maukanmir/Downloads/archive/"

filepaths = ['Education', 'Sports', 'Finance', 'Politics']

In [9]:
edu_df, sport_df, finance_df, politics_df = read_dataframes(PATH, filepaths)

# EDA Analysis

In [11]:

data_frames = [edu_df, sport_df, finance_df, politics_df ]

for df in data_frames:
  print(f"Number of NA values: {df.isna().sum()}")
  print(f"Number of Duplicated values: {df.duplicated().sum()}")
  print(df.shape)

Number of NA values: Text     0
Label    0
dtype: int64
Number of Duplicated values: 0
(52, 2)
Number of NA values: Text     0
Label    0
dtype: int64
Number of Duplicated values: 0
(56, 2)
Number of NA values: Text     0
Label    0
dtype: int64
Number of Duplicated values: 0
(48, 2)
Number of NA values: Text     0
Label    0
dtype: int64
Number of Duplicated values: 0
(53, 2)


# Check for class imbalances

In [15]:
for topic, df in zip(filepaths, data_frames):
  print("------------------------")
  print(f"Topic is {topic}")
  check_class_imbalance(df)

------------------------
Topic is Education
Class=positive, Count=26, Percentage=50.000%
Class=negative, Count=26, Percentage=50.000%
------------------------
Topic is Sports
Class=positive, Count=28, Percentage=50.000%
Class=negative, Count=28, Percentage=50.000%
------------------------
Topic is Finance
Class=positive, Count=34, Percentage=70.833%
Class=negative, Count=14, Percentage=29.167%
------------------------
Topic is Politics
Class=positive, Count=25, Percentage=47.170%
Class=negative, Count=28, Percentage=52.830%


In [16]:
tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")

model = AutoModelForSequenceClassification.from_pretrained("google-bert/bert-base-uncased")

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [20]:
for df in data_frames:
  df["Label"] = df["Label"].apply(lambda x: 1 if x == 'negative' else 0)

In [22]:

for df in data_frames:
  df["BERT_score"] = df["Text"].apply(lambda review: sentiment_score(review, tokenizer, model))
