<a href="https://colab.research.google.com/github/MK316/Spring2024/blob/main/Corpus/TEDdata/will-be/Will_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data analysis: finding 'will' and 'be going to' patterns

Written by MK316 (0606): Run the code and see how to get the result.

+ You can get the result file on this page (it is also uploaded to your repository): will-begoingto.csv

In [None]:
# Data to read

import pandas as pd

# Raw-file URL of the CSV to analyze (hosted in the course GitHub repository)
datalink = "https://raw.githubusercontent.com/MK316/Spring2024/main/Corpus/TEDdata/Cleantext0605F.csv"
# Download the CSV and parse it into a DataFrame, decoding the bytes as UTF-8
data = pd.read_csv(datalink, encoding="utf-8")
# Last expression of the cell: the notebook displays the final rows as a quick
# check that the whole file was read
data.tail()

## Selected columns to analyze

e.g., data => df: with selected columns (excluding the original text)

+ Rename: Cleaned_length > Length, Cleanedtext01 > CText

In [None]:
import pandas as pd

# Columns carried over from the loaded table; edit this list to analyze other columns
columns_to_select = [
    'TID',
    'Cleaned_length',
    'Wordcount',
    'Cleanedtext01',
]

# Keep only those columns and give two of them the shorter names
# (Length, CText) that the following cells refer to
df = data[columns_to_select].rename(
    columns={
        'Cleaned_length': 'Length',
        'Cleanedtext01': 'CText',
    }
)

# Show the resulting table
print(df)


## [1] Count 'will' and add a new column

In [None]:
import pandas as pd
import re

# Function to count complete (whole-word) matches of a word such as 'will'
def count_exact_match(text, word):
    """Count case-insensitive whole-word occurrences of ``word`` in ``text``.

    Args:
        text: The text to search. Values that are not strings (for example
            the NaN that pandas uses for an empty cell) count as no text.
        word: The literal word or phrase to look for; regex metacharacters
            in it are escaped before matching.

    Returns:
        The number of non-overlapping matches as an int; 0 when ``text`` is
        not a string or ``word`` is empty.
    """
    if not isinstance(text, str) or not word:
        return 0
    # Lookarounds instead of \b: \b only fires next to a word character, so a
    # search term that starts or ends with punctuation (e.g. "c++") could
    # never match with it. For ordinary words the result is the same.
    pattern = r'(?<!\w){}(?!\w)'.format(re.escape(word))
    return len(re.findall(pattern, text, re.IGNORECASE))

# Add a column holding the number of whole-word 'will' matches in each text;
# the keyword argument is forwarded by apply to count_exact_match
df['Nwill'] = df['CText'].apply(count_exact_match, word='will')

# Show the table with the new column
print(df)


## [2] Count 'be going to' and add a new column

Explanation of the code:

1. Data Preparation: We define a DataFrame df1 containing the provided example data. This includes texts that should have instances of the different forms of "be going to".
2. Regular Expression Function: The function count_be_going_to utilizes the regular expression defined in pattern to match any form of "am going to", "is going to", "are going to", "was going to", and "were going to". The \b ensures these are whole words (not part of larger words).
3. Apply Function: We use df1['CText'].apply(count_be_going_to) to apply our counting function to each row in the CText column. The result is stored in a new column NbeGoingTo.
4. Result Display: We print the updated DataFrame to verify our results.
5. Regular Expression Pattern: The pattern now includes contractions like "I'm", "he's", "she's", "it's", "they're", "you're", and "we're" along with the original forms.
6. Test Data: The example texts in the DataFrame have been updated to include these contraction forms to demonstrate that the updated function captures them correctly.

This is a test version with sample data (before we process our own data).

In [None]:
import pandas as pd
import re

# Create the sample DataFrame.
# The dictionary is named sample_data rather than data so that running this
# test cell does not overwrite the DataFrame called data that was loaded from
# the CSV (re-running the column-selection cell afterwards would otherwise fail).
sample_data = {
    'TID': [1, 2],
    'Length': [9008, 6663],
    'Wordcount': [1657, 1223],
    'CText': [
        "I'm going to explain the use of voice. We are going to see its effects. He's going to call yesterday.",
        "They're going to participate in the event. She is going to travel tomorrow. You were going to tell me that story."
    ]
}
df1 = pd.DataFrame(sample_data)

# Function to count occurrences of "be going to" in all forms including contractions
def count_be_going_to(text):
    """Count "be going to" constructions in ``text``, case-insensitively.

    Matches am/is/are/was/were and the contractions I'm, he's, she's, it's,
    they're, you're, we're, each followed by "going to".

    Args:
        text: The text to search. Non-string values (for example the NaN
            that pandas uses for an empty cell) are treated as containing
            no matches.

    Returns:
        The number of non-overlapping matches as an int.
    """
    if not isinstance(text, str):
        return 0
    # ['\u2019] accepts both the straight and the typographic apostrophe, and
    # \s+ tolerates repeated spaces or line breaks between the words.
    pattern = (
        r"\b(?:am|is|are|was|were"
        r"|I['\u2019]m"
        r"|(?:he|she|it)['\u2019]s"
        r"|(?:they|you|we)['\u2019]re)"
        r"\s+going\s+to\b"
    )
    # Find all matches with case insensitive search
    return len(re.findall(pattern, text, re.IGNORECASE))

# Apply the function to each row in 'CText' and store the per-text count
# in a new column of the sample DataFrame
df1['NbeGoingTo'] = df1['CText'].apply(count_be_going_to)

# Display the DataFrame to see the result
# (each of the two sample texts contains three "be going to" forms)
print(df1)


+ Our data to process

In [None]:
import pandas as pd
import re

# Function to count occurrences of "be going to" in all forms including contractions
def count_be_going_to(text):
    """Count "be going to" constructions in ``text``, case-insensitively.

    Matches am/is/are/was/were and the contractions I'm, he's, she's, it's,
    they're, you're, we're, each followed by "going to".

    Args:
        text: The text to search. Non-string values (for example the NaN
            that pandas uses for an empty cell) are treated as containing
            no matches.

    Returns:
        The number of non-overlapping matches as an int.
    """
    if not isinstance(text, str):
        return 0
    # ['\u2019] accepts both the straight and the typographic apostrophe, and
    # \s+ tolerates repeated spaces or line breaks between the words.
    pattern = (
        r"\b(?:am|is|are|was|were"
        r"|I['\u2019]m"
        r"|(?:he|she|it)['\u2019]s"
        r"|(?:they|you|we)['\u2019]re)"
        r"\s+going\s+to\b"
    )
    # Find all matches with case insensitive search
    return len(re.findall(pattern, text, re.IGNORECASE))

# Apply the function to each row in 'CText' and store the per-text count
# of "be going to" forms in a new column of df
df['NbeGoingTo'] = df['CText'].apply(count_be_going_to)

# Display the DataFrame to see the result
print(df)


In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of df's columns;
# with pandas' default settings only the numeric columns are summarized
df.describe()

# Spearman's rank correlation coefficient: Nwill and NbeGoingTo

+ We use Spearman's rank correlation coefficient instead of Pearson's correlation coefficient. Spearman's correlation is more suitable for ordinal data or non-normally distributed variables, which is often the case with count data.

In [None]:
import pandas as pd
from scipy.stats import spearmanr

# Columns entering the pairwise rank correlations
wordcount_col = df['Wordcount']
nwill_col = df['Nwill']
nbegoingto_col = df['NbeGoingTo']

# Spearman's rank correlation for each pair of variables;
# every result carries the coefficient together with its p-value
corr_wordcount_nwill = spearmanr(wordcount_col, nwill_col)
corr_wordcount_nbegoingto = spearmanr(wordcount_col, nbegoingto_col)
corr_nwill_nbegoingto = spearmanr(nwill_col, nbegoingto_col)

# Report the three results, one per line
for label, result in (
    ("Correlation between Wordcount and Nwill:", corr_wordcount_nwill),
    ("Correlation between Wordcount and NbeGoingTo:", corr_wordcount_nbegoingto),
    ("Correlation between Nwill and NbeGoingTo:", corr_nwill_nbegoingto),
):
    print(label, result)


### Interpretation of Results

Results:

1) Correlation between Wordcount and Nwill: SignificanceResult(statistic=0.29014237616704847, pvalue=0.0034091910932080175)

2) Correlation between Wordcount and NbeGoingTo: SignificanceResult(statistic=0.4254841848162721, pvalue=1.0196562679335068e-05)

3) Correlation between Nwill and NbeGoingTo: SignificanceResult(statistic=0.21150465018464754, pvalue=0.0346508356304406)

### 1. Correlation between Wordcount and Nwill:

+ Coefficient (0.2901): This value suggests a weak positive correlation between the Wordcount and Nwill. This indicates that as the Wordcount increases, the count of the exact word "will" (Nwill) tends to increase slightly.
+ P-value (0.0034): The p-value is less than 0.05, indicating that the correlation is statistically significant. This suggests that the observed correlation is unlikely to be due to random chance.

### 2. Correlation between Wordcount and NbeGoingTo:

+ Coefficient (0.4255): This value suggests a moderate positive correlation between the Wordcount and NbeGoingTo. As Wordcount increases, the instances of "be going to" phrases also tend to increase.
+ P-value (about 0.00001): This extremely low p-value strongly suggests that the correlation is statistically significant and not due to random variation.

### 3. Correlation between Nwill and NbeGoingTo:

+ Coefficient (0.2115): This value indicates a weak positive correlation between the occurrences of "will" and the "be going to" phrases. While there is some level of association, it is relatively weak.
+ P-value (0.0347): This p-value is just below the 0.05 threshold, suggesting that the correlation is statistically significant, although it's close to the boundary where we might consider it insignificant.

## Normalise the occurrences: Occurrences per 1000 words

In [None]:
import pandas as pd

# Express each count as a rate per 1000 words so that texts of different
# lengths can be compared
words = df['Wordcount']
df['Nwill_per_1000'] = df['Nwill'].div(words).mul(1000)
df['NbeGoingTo_per_1000'] = df['NbeGoingTo'].div(words).mul(1000)

# Peek at the first rows to check the new columns
print(df.head())

# Spearman's rank correlation between the two normalized rates
from scipy.stats import spearmanr
corr_normalized_nwill_nbegoingto = spearmanr(
    df['Nwill_per_1000'],
    df['NbeGoingTo_per_1000'],
)
print("Correlation between normalized Nwill and NbeGoingTo:", corr_normalized_nwill_nbegoingto)


In [None]:
# Write df to will-begoingto.csv in the current working directory;
# index=False leaves the DataFrame's row index out of the file
df.to_csv('will-begoingto.csv', index=False)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# One point per text in df: x = number of 'will', y = number of 'be going to'
ax = sns.scatterplot(data=df, x='Nwill', y='NbeGoingTo')

# Label the axes object that seaborn drew on, then render the figure
ax.set_title('Scatter Plot of Nwill vs NbeGoingTo')
ax.set_xlabel('Nwill')
ax.set_ylabel('NbeGoingTo')
plt.show()


In [None]:
# Grid of pairwise plots for the two counts and the text length in words
# (uses sns and plt imported in the previous cell)
sns.pairplot(df[['Nwill', 'NbeGoingTo', 'Wordcount']])
plt.show()


=> Skewed Distributions: Both Nwill and NbeGoingTo are heavily skewed to the right, meaning that in most texts, these words/phrases appear infrequently. This could suggest specialized usage or contextual dependence of these terms.
Weak Correlations: The weak correlation between Nwill and NbeGoingTo, and the weak-to-moderate correlations of each with Wordcount, suggest that the frequency of these terms is influenced by factors other than just the length of the text. This might include the style, genre, or specific thematic content of the texts.

## Clustering

To perform cluster analysis on your data to explore patterns in the occurrences of Nwill, NbeGoingTo, and their relationship with Wordcount, you can use the K-means clustering algorithm from the scikit-learn library. This method will help you identify groups (clusters) of texts that exhibit similar characteristics in terms of these variables.

Here's how you can implement this in Python:

Step-by-Step Guide for K-means Clustering
Data Preparation: Ensure your data is suitable for clustering. You might consider normalizing the data due to different scales, especially if Wordcount is significantly higher in magnitude than Nwill and NbeGoingTo.

Perform Clustering: Use the K-means algorithm to cluster the data.

Visualize the Results: Create plots to visualize the clusters for better interpretation.

In [None]:
import pandas as pd
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler

# Normalize the data: z-score every feature so that features measured on
# different scales contribute comparably to the distances K-means uses
scaler = StandardScaler()
df_scaled = scaler.fit_transform(df[['Wordcount', 'Nwill', 'NbeGoingTo']])

# Perform K-means clustering
# n_init is given explicitly: scikit-learn changed its default from 10 to
# 'auto' in version 1.4 (and warns about the change in 1.2/1.3), so omitting
# it makes the cluster assignment depend on the installed version.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=0)  # Choose the number of clusters
# Store each text's cluster label (0, 1 or 2) next to its data
df['cluster'] = kmeans.fit_predict(df_scaled)

# Visualizing the clusters
sns.pairplot(df, vars=['Wordcount', 'Nwill', 'NbeGoingTo'], hue='cluster', palette='viridis')
plt.suptitle('Pair Plot of Text Data by Cluster', y=1.02)
plt.show()

# Optionally, analyze cluster centers
# inverse_transform converts the centers from z-scores back to the original units
centers = pd.DataFrame(scaler.inverse_transform(kmeans.cluster_centers_), columns=['Wordcount', 'Nwill', 'NbeGoingTo'])
print("Cluster centers (in original scale):")
print(centers)


In [None]:
import pandas as pd
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler

# Normalize the data
# Even though the data is already normalized per 1000 words, standardizing can still help to equalize variance.
scaler = StandardScaler()
df_normalized = pd.DataFrame(scaler.fit_transform(df[['Nwill_per_1000', 'NbeGoingTo_per_1000']]),
                             columns=['Nwill_per_1000', 'NbeGoingTo_per_1000'])

# Perform K-means clustering
# n_init is given explicitly: scikit-learn changed its default from 10 to
# 'auto' in version 1.4 (and warns about the change in 1.2/1.3), so omitting
# it makes the cluster assignment depend on the installed version.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=0)  # Choose the number of clusters based on domain knowledge or analysis
# The right-hand side is evaluated first, so the clustering sees only the two
# feature columns; the label column is added afterwards
df_normalized['cluster'] = kmeans.fit_predict(df_normalized)

# Visualizing the clusters using normalized data
sns.pairplot(df_normalized, vars=['Nwill_per_1000', 'NbeGoingTo_per_1000'], hue='cluster', palette='viridis')
plt.suptitle('Pair Plot of Normalized Text Data by Cluster', y=1.02)
plt.show()

# Optionally, print normalized cluster centers
# (these stay in standardized units, i.e. z-scores, not occurrences per 1000 words)
print("Normalized cluster centers:")
print(kmeans.cluster_centers_)


=> Interpreting Values:

Positive Values: A positive z-score indicates that the feature's value is above the overall mean of that feature across all data points.
Negative Values: A negative z-score indicates that the feature's value is below the overall mean.

In [None]:
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# Normalize the data: z-score the two per-1000-word rates
scaler = StandardScaler()
df_scaled = pd.DataFrame(scaler.fit_transform(df[['Nwill_per_1000', 'NbeGoingTo_per_1000']]),
                         columns=['Nwill_per_1000', 'NbeGoingTo_per_1000'])

# Perform K-means clustering
# n_init is given explicitly: scikit-learn changed its default from 10 to
# 'auto' in version 1.4 (and warns about the change in 1.2/1.3), so omitting
# it makes the cluster assignment depend on the installed version.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=0)  # Assume the optimal number of clusters is 3
# Replaces any existing 'cluster' column in df with the labels from this clustering
df['cluster'] = kmeans.fit_predict(df_scaled)

# Grouping the TIDs by cluster: one list of text IDs per cluster label
cluster_groups = df.groupby('cluster')['TID'].apply(list)

# Print the TIDs in each cluster
print("Text IDs in each cluster:")
for cluster, tids in cluster_groups.items():
    print(f"Cluster {cluster}: Text IDs {tids}")


# Total text file (txt)


In [None]:
#@markdown Searching a match (complete or partial)

# 3) Get user input for the word to find
search_word = input("Enter the word to find: ")
# The answer is lower-cased, so 'C' and 'P' are accepted as well; the search
# function treats every answer other than 'c' as a partial-match request
match_type = input("Type 'c' complete matches only, or 'p' for partial matches: ").lower()

# 4) Function to find occurrences
def find_occurrences(text, word, match_type):
    """Return a short context snippet for every occurrence of ``word`` in ``text``.

    Matching is case-insensitive and non-overlapping.

    Args:
        text: The text to search.
        word: The literal word or phrase to look for.
        match_type: 'c' matches whole words only; any other value also
            matches inside longer words.

    Returns:
        A list of strings in text order, one per match, each holding the match
        plus up to 30 characters on either side. Empty when ``word`` is empty.
    """
    import re  # local import so this cell does not depend on an earlier cell's import

    if not word:
        # An empty search term matches at every position and the scan would
        # never advance, so treat it as "nothing to find".
        return []

    pattern = re.escape(word)
    if match_type == 'c':
        # Word-character lookarounds rather than literal surrounding spaces, so
        # matches at the start/end of the text or next to punctuation
        # ("will," / "will.") are counted too.
        pattern = r'(?<!\w)' + pattern + r'(?!\w)'

    occurrences = []
    for match in re.finditer(pattern, text, re.IGNORECASE):
        # Calculate start and end positions for slicing, clamped to the text
        start = max(0, match.start() - 30)
        end = min(len(text), match.end() + 30)
        occurrences.append(text[start:end])

    return occurrences

# Collect a context snippet for every match in the combined text.
# NOTE(review): combined_text is not defined in any cell visible here —
# confirm it is created (e.g. by joining the texts) before this cell runs.
occurrences = find_occurrences(combined_text, search_word, match_type)

# 5) Decide how many occurrences to display
print(f"Total occurrences found: {len(occurrences)}")
print("="*50)
if len(occurrences) > 10:
    # Only the answer '10' truncates the list; any other answer shows everything
    choice = input("More than 10 occurrences found. Type 'a' to display all or '10' to display only the first 10: ").lower()
    print("="*50)
    if choice == '10':
        occurrences = occurrences[:10]

# 6) Display occurrences, one snippet per line
for occurrence in occurrences:
    print(occurrence)

# 7) Print summary (closing separator line)
print("="*50)