## 2.0 Analysis of 20 NewsGroups data

This is the finalised version of the Analysis section. Included are:


1) Summary Statistics of Data

2) Most common words for each category

3) Analysis of Parts of Speech 

4) Bigram analysis


The data is imported as a CSV after text cleaning, but before vectorising.


In [None]:
import pandas as pd
import numpy as np

In [None]:
# Mount Google Drive at /content/drive (requires the google.colab package,
# i.e. a Colab runtime).
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Give your project directory here; the data sets should be in this location.
proj_dir = '/content/drive/MyDrive/Colab Notebooks/doc2vec/'

In [None]:
# Show every column when a DataFrame is displayed (no column truncation).
pd.options.display.max_columns = None

In [None]:
# Load the cleaned training data from the project directory and preview it.
train_csv_path = proj_dir + 'train_cleaned_data.csv'
train_cleaned_df = pd.read_csv(train_csv_path)
train_cleaned_df.head()

# 1. Summary statistics of data

In [None]:
# Number of documents (rows in the cleaned frame)
print(train_cleaned_df.shape[0])

In [None]:
# Number of documents in each category, printed as plain text
category_counts = train_cleaned_df["folder name"].value_counts()
print(category_counts)

# The same counts as a pandas DataFrame
data_summary_df = category_counts.to_frame().reset_index()

In [None]:
# Display the per-category document counts built from value_counts().
data_summary_df

In [None]:
# Fix NaNs: a row whose cleaned text is missing falls back to its folder name,
# so every row holds a string afterwards.
missing_text = train_cleaned_df['text cleaned'].isnull()
train_cleaned_df['text cleaned'] = train_cleaned_df['text cleaned'].where(
    ~missing_text, train_cleaned_df['folder name']
)

In [None]:
# Count how often each word occurs within each category.
# The split word lists go into a separate `word` column and THAT column is
# exploded (one row per word). Exploding the original string column would be a
# no-op and value_counts() would count whole documents instead of words.
(
    train_cleaned_df
    .assign(word=train_cleaned_df['text cleaned'].str.split())
    .explode('word')
    .groupby('folder name', sort=False)['word']
    .value_counts()
)

In [None]:
# Group data by category: concatenate every document of a category into one string.
# Documents are joined with a space so that whitespace-based tokenisation
# (str.split, word_tokenize) does not fuse the last word of one document with
# the first word of the next, as a ',' separator would.
train_cl_gr_df = (
    train_cleaned_df
    .groupby('folder name')
    .agg({'text cleaned': ' '.join})
    .reset_index()
)
train_cl_gr_df

In [None]:
# Total words per group (whitespace-separated tokens in the concatenated text)
train_cl_gr_df['total words'] = train_cl_gr_df['text cleaned'].str.split().str.len()

# value_counts().reset_index() names its columns differently across pandas
# versions: ('index', 'folder name') before 2.0, ('folder name', 'count') from 2.0.
if 'index' in data_summary_df.columns:
    category_col, doc_count_col = 'index', 'folder name'
else:
    category_col, doc_count_col = 'folder name', 'count'

# Look the totals up by category name. data_summary_df is ordered by document
# count while train_cl_gr_df is ordered by folder name, so assigning by row
# position would attach totals to the wrong categories.
words_by_category = train_cl_gr_df.set_index('folder name')['total words']
data_summary_df['total words'] = data_summary_df[category_col].map(words_by_category)

# Average words per document = total words / number of documents in the category
data_summary_df['average words'] = (
    data_summary_df['total words'] / data_summary_df[doc_count_col]
)

data_summary_df

# 2. Most common words in each category

In [None]:
# Plots of the most common words by category.
from collections import Counter
import matplotlib.pyplot as plt
import re

TOP_N_WORDS = 10

# Notebook-wide display settings, kept from the original cell because they
# persist for every cell that runs afterwards.
pd.set_option('display.max_colwidth', None)
plt.rcParams['figure.figsize'] = [15, 5]

for folder in train_cl_gr_df['folder name']:
    target_df = train_cl_gr_df[train_cl_gr_df['folder name'] == folder]

    # Use the raw strings: Series.to_string() would also emit the row index
    # labels, which would then be counted as words.
    string_text = ' '.join(target_df['text cleaned'])

    # Replace (rather than delete) every run of non-alphanumeric characters, so
    # words that are separated only by punctuation such as ',' are not fused.
    string_text = re.sub(r"[^A-Za-z0-9 ]+", " ", string_text)

    # Case-insensitive counts; single-character tokens are ignored.
    vocab = Counter(
        word.lower() for word in string_text.split() if len(word) != 1
    )
    most_common = vocab.most_common(TOP_N_WORDS)

    # Print only the top words, not the whole vocabulary or the raw text.
    print('Most Common Words: ', folder, most_common)

    words = [word for word, _ in most_common]
    counts = [count for _, count in most_common]

    # Create the figure BEFORE drawing; calling plt.figure() after plt.bar()
    # opens a second, empty figure and leaves the bar chart at the default size.
    fig, ax = plt.subplots(figsize=(20, 10))
    ax.bar(words, counts)
    ax.set_title("10 most Common Words Category: " + str(folder))
    ax.set_ylabel("Frequency")
    ax.set_xlabel("Words")
    plt.show()
    plt.close(fig)  # one figure per category: release it once shown


## 3. Part of Speech Analysis

Identify the number of nouns, verbs, etc. in each group and display the result as a stacked bar chart.

In [None]:
import nltk
from nltk import pos_tag, word_tokenize, tokenize
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

def NounCount(x):
    """Return the number of tokens in text ``x`` whose POS tag starts with 'NN'."""
    tagged_tokens = pos_tag(word_tokenize(x))
    noun_tags = [tag for _, tag in tagged_tokens if tag.startswith('NN')]
    return len(noun_tags)

train_cl_gr_df["nouns"] = train_cl_gr_df["text cleaned"].apply(NounCount)

In [None]:
def VerbCount(x):
    """Return the number of tokens in text ``x`` whose POS tag starts with 'VB'."""
    total = 0
    for _, tag in pos_tag(word_tokenize(x)):
        if tag.startswith('VB'):
            total += 1
    return total

train_cl_gr_df["verbs"] = train_cl_gr_df["text cleaned"].apply(VerbCount)

In [None]:
# classify adverbs
def AdverbCount(x):
    """Return the number of tokens in text ``x`` whose POS tag starts with 'RB'."""
    tags = (tag for _, tag in pos_tag(word_tokenize(x)))
    return len([tag for tag in tags if tag.startswith('RB')])

train_cl_gr_df["adverbs"] = train_cl_gr_df["text cleaned"].apply(AdverbCount)

In [None]:
# classify adjectives
def AdjectiveCount(x):
    """Return the number of tokens in text ``x`` whose POS tag starts with 'JJ'."""
    tagged_tokens = pos_tag(word_tokenize(x))
    adjectives = [token for token, tag in tagged_tokens if tag.startswith('JJ')]
    return len(adjectives)

train_cl_gr_df["adjectives"] = train_cl_gr_df["text cleaned"].apply(AdjectiveCount)

In [None]:
# classify other
def OtherCount(x):
    """Return the number of tokens in text ``x`` that are not tagged NN*, VB*, RB* or JJ*."""
    counted_prefixes = ('NN', 'VB', 'RB', 'JJ')
    remaining = 0
    for _, tag in pos_tag(word_tokenize(x)):
        if tag.startswith(counted_prefixes):
            continue
        remaining += 1
    return remaining

train_cl_gr_df["other"] = train_cl_gr_df["text cleaned"].apply(OtherCount)

In [None]:
# Show the POS counts next to the category name.
pos_summary_columns = ["folder name", "nouns", "verbs", "adverbs", "adjectives", "other"]
train_cl_gr_df[pos_summary_columns]

In [None]:
# visualisation to show split of word types
# Plot only the POS count columns. DataFrame.plot draws every numeric column by
# default, so any other numeric column on train_cl_gr_df (e.g. 'total words')
# would be stacked into the bars as well.
pos_columns = ["nouns", "verbs", "adverbs", "adjectives", "other"]
ax = train_cl_gr_df.plot.barh(
    x='folder name', y=pos_columns, stacked=True, title='POS Categorisation'
)

In [None]:
# Scaled - proportions of POS
# Show each POS count as a percentage of the total number of tagged tokens
# for its category.
pos_columns = ["nouns", "verbs", "adverbs", "adjectives", "other"]
pos_counts = train_cl_gr_df[pos_columns]

# Divide every row by its own total in one vectorised step. Building a new
# frame, rather than assigning into a slice of train_cl_gr_df, avoids
# SettingWithCopyWarning.
pos_percent = pos_counts.div(pos_counts.sum(axis=1), axis=0) * 100

# Re-attach the category names (both frames share train_cl_gr_df's index).
train_pos_df_scaled = train_cl_gr_df[['folder name']].join(pos_percent)

In [None]:
# Stacked horizontal bars of the scaled POS values, one bar per category.
ax = train_pos_df_scaled.plot(
    kind='barh',
    x='folder name',
    stacked=True,
    title='POS Categorisation (Scaled)',
)

# 4. Bigrams

This section calculates the most frequently occurring bigrams.

In [None]:
from nltk import FreqDist

# Tokenise the text column directly: DataFrame.to_string() would also emit the
# column header and the row index labels, which would be counted as words.
# Commas are treated as separators so that words joined only by a ',' are not
# fused into a single token.
corpus_tokens = ' '.join(train_cl_gr_df['text cleaned']).replace(',', ' ').split()

fd = FreqDist(corpus_tokens)
fd.plot(20)

In [None]:
#frequency of bigrams
from nltk import bigrams
import matplotlib.pyplot as plt

# Number of bigrams plotted; the title is built from it so the two cannot disagree.
N_BIGRAMS = 50

# Tokenise the text column directly: DataFrame.to_string() would also emit the
# column header and the row index labels as tokens. Commas are treated as
# separators so that words joined only by a ',' are not fused.
corpus_tokens = ' '.join(train_cl_gr_df['text cleaned']).replace(',', ' ').split()
fd_bg = FreqDist(map(' '.join, bigrams(corpus_tokens)))

# Create the figure explicitly so that `fig` exists for savefig below
# (it was never defined before, which raised a NameError).
fig = plt.figure(figsize=(10, 4))
fd_bg.plot(N_BIGRAMS, title='Top {} Most Common Bigrams in Whole Text'.format(N_BIGRAMS))

fig.savefig('freqDist.png', bbox_inches="tight")

In [None]:
# Bigrams :  pointwise mutual information

from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder

# Tokenise the text column directly: DataFrame.to_string() would also emit the
# column header and the row index labels as tokens. Such one-off tokens score
# artificially high under PMI. Commas are treated as separators so that words
# joined only by a ',' are not fused.
corpus_tokens = ' '.join(train_cl_gr_df['text cleaned']).replace(',', ' ').split()

bigram_finder = BigramCollocationFinder.from_words(corpus_tokens)
# The 20 bigrams with the highest PMI score
bigram_finder.nbest(BigramAssocMeasures().pmi, 20)

In [None]:
# Tokenise the text column directly: DataFrame.to_string() would also emit the
# column header and the row index labels as tokens. Commas are treated as
# separators so that words joined only by a ',' are not fused.
corpus_tokens = ' '.join(train_cl_gr_df['text cleaned']).replace(',', ' ').split()

# (bigram, PMI score) pairs, highest score first
bigram_pmi_scores = BigramCollocationFinder.from_words(corpus_tokens).score_ngrams(
    BigramAssocMeasures().pmi
)

# Show only the top of the ranking; the full list has one entry per distinct
# bigram in the corpus and would flood the cell output.
bigram_pmi_scores[:300]

In [None]:
# 20.39 (the top score) --> there isn't any difference in significance between the first few hundred bigrams.

In [None]:
from nltk.collocations import TrigramAssocMeasures, TrigramCollocationFinder

# Tokenise the text column directly: DataFrame.to_string() would also emit the
# column header and the row index labels as tokens. Commas are treated as
# separators so that words joined only by a ',' are not fused.
corpus_tokens = ' '.join(train_cl_gr_df['text cleaned']).replace(',', ' ').split()

trigram_finder = TrigramCollocationFinder.from_words(corpus_tokens)
# The 10 trigrams with the highest PMI score
trigram_finder.nbest(TrigramAssocMeasures().pmi, 10)