# Label text analysis

This notebook analyses the frequency distribution of words in the detailed labels stored in the `pass_result.csv` file.

In [None]:
# Import relevant libraries
import pandas as pd

import nltk
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
# Fetch the NLTK data packages this notebook relies on:
# 'punkt' provides the tokenizer models behind word_tokenize, and
# 'stopwords' provides the word lists behind stopwords.words('english').
# NOTE(review): recent NLTK releases load the tokenizer from 'punkt_tab'
# instead of 'punkt' -- confirm against the installed NLTK version.
nltk.download('punkt')
nltk.download('stopwords')

import matplotlib.pyplot as plt

In [None]:
# Load the pass-result CSV export into the DataFrame `df`
df = pd.read_csv(
    filepath_or_buffer=r'C:\Users\504631\OneDrive - Amey plc\Documents\GitHub\va-poc\karolina_code\csv_files\pass_result.csv',
)

In [None]:
# Display information about the dataset: the first rows, then the
# column names, dtypes and non-null counts.
print(df.head())
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would add a stray "None" line after the report.
df.info()

## Frequency Analysis

In [None]:
# Tokenize the labels.
# Empty CSV cells are loaded as float NaN, which word_tokenize cannot
# process, so drop missing labels and coerce the remainder to str first.
labels = df['label'].dropna().astype(str)
tokens = [word_tokenize(label) for label in labels]

# Flatten the per-label token lists into a single list of tokens
flat_tokens = [token for sublist in tokens for token in sublist]

# Count how often each distinct token occurs
freq_dist = FreqDist(flat_tokens)

# Print the 10 most common tokens together with their counts
print(freq_dist.most_common(10))

In [None]:
# Plot the frequency distribution of the raw tokens, limited to 30 samples.
# cumulative=False plots each token's own count rather than a running total.
freq_dist.plot(30, cumulative=False)
plt.show()

In [None]:
# Tabulate the distribution: one row per distinct token with its count
df_freq = pd.DataFrame(
    data=list(freq_dist.items()),
    columns=['word', 'frequency'],
)

In [None]:
# Display information about the word-frequency table: the first rows,
# then the column names, dtypes and non-null counts.
print(df_freq.head())
# DataFrame.info() prints its own report and returns None; calling it
# directly avoids the extra "None" line that print(df_freq.info()) emits.
df_freq.info()

In [None]:
# Discard the first row, then order the remaining words from the most to
# the least frequent.
# NOTE(review): the table has not been sorted yet when the row is dropped,
# so the discarded row is presumably the first token encountered rather
# than the most frequent one -- confirm this is the intended row.
# Re-running this cell drops a further row each time, because df_freq is
# reassigned from itself.
df_freq = df_freq.iloc[1:].sort_values(by='frequency', ascending=False)

In [None]:
# Print the whole word-frequency table: no row or column truncation and a
# display precision of 3. The options only apply inside the `with` block.
full_display = (
    'display.max_rows', None,
    'display.max_columns', None,
    'display.precision', 3,
)
with pd.option_context(*full_display):
    print(df_freq)

## Frequency Analysis using pre-processing

Pre-process the labels before repeating the frequency analysis: lowercase the words, drop stopwords and non-alphabetic tokens, and stem the words that remain.

In [None]:
# Load the pass-result CSV export into a second DataFrame, `df2`,
# which receives the pre-processed labels
df2 = pd.read_csv(
    filepath_or_buffer=r'C:\Users\504631\OneDrive - Amey plc\Documents\GitHub\va-poc\karolina_code\csv_files\pass_result.csv',
)

In [None]:
# Pre-processing setup for the labels (tokenize, remove stopwords, stem):
# a Porter stemmer and the set of English stopwords to discard
ps = PorterStemmer()
stop_words = {*stopwords.words('english')}

def preprocess_label(label):
    """Normalize one label into a space-separated string of word stems.

    The label is tokenized, tokens that are not purely alphabetic or that
    are stopwords (compared in lower case) are dropped, and each remaining
    token is lower-cased and stemmed.

    Args:
        label: The label text. Non-string values, such as the float NaN
            that pandas uses for empty CSV cells, are treated as empty.

    Returns:
        The stems joined by single spaces; an empty string when the label
        is not a string or no token survives the filtering.
    """
    # word_tokenize only accepts text; a missing value would raise here.
    if not isinstance(label, str):
        return ''

    stems = []
    for word in word_tokenize(label):
        lowered = word.lower()
        if word.isalpha() and lowered not in stop_words:
            stems.append(ps.stem(lowered))
    return ' '.join(stems)

# Store the pre-processed form of every 'label' in a 'normalized_label' column
df2['normalized_label'] = df2['label'].apply(preprocess_label)

# Re-tokenize each normalized label, keeping both the per-label token lists
# and one flat list of all tokens
normalized_tokens = []
flat_normalized_tokens = []
for normalized_label in df2['normalized_label']:
    label_tokens = word_tokenize(normalized_label)
    normalized_tokens.append(label_tokens)
    flat_normalized_tokens.extend(label_tokens)

# Count how often each normalized token occurs
freq_dist_normalized = FreqDist(flat_normalized_tokens)

# Display the most common normalized words
print(freq_dist_normalized.most_common(10))

In [None]:
# Plot the frequency distribution of the normalized tokens, limited to 30
# samples. cumulative=False plots each token's own count rather than a
# running total.
freq_dist_normalized.plot(30, cumulative=False)
plt.show()

In [None]:
# Tabulate the normalized distribution, ordered from the most to the least
# frequent token
labels_freq_df = pd.DataFrame(
    list(freq_dist_normalized.items()),
    columns=['Label', 'Frequency'],
).sort_values(by='Frequency', ascending=False)

# Lift pandas' row and column display limits so the whole table is shown.
# pd.set_option changes these settings globally, so they stay in effect
# for any output produced after this cell.
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, None)

# Display the DataFrame
print(labels_freq_df)