# Data Science Portfolio (Part2)


In [None]:
import numpy as np
import string
import pandas as pd
import re
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px
import math
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
# Download commentary.txt into the current working directory.
# Please just run this block and do not comment it out.
from urllib import request

# Parallel lists: the payload at module_url[i] is saved locally as name[i].
module_url = ["https://drive.google.com/uc?export=view&id=18y6hLv2bqAyJsIXwVCty58lF0u7yimVq"]
name = ['commentary.txt']
for url, filename in zip(module_url, name):
    # The payload is decoded as ISO-8859-1 and written back explicitly as UTF-8.
    # Without an explicit encoding, open() uses the platform default, which may not
    # match the UTF-8 default that pd.read_csv applies when the file is loaded later.
    with request.urlopen(url) as response, open(filename, 'w', encoding='utf-8') as outf:
        outf.write(response.read().decode('ISO-8859-1'))
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
import nltk
import re
from tqdm import tqdm
tqdm.pandas()
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

# Text Analysis

In this question, we will interrogate the football commentary dataset

In [None]:
df = pd.read_csv('commentary.txt', sep='\t')
df.head()

Unnamed: 0,Minute,Commentary
0,97,Plenty of chances in this game but neither tea...
1,97,That's it! The referee blows the final whistle
2,97,"Ball possession: Tottenham: 44%, Liverpool: 56%."
3,96,James Milner relieves the pressure with a clea...
4,96,Poor play by Trent Alexander-Arnold as his wea...


## Step 1 Preprocessing

Implement a method for obtaining tokenized, PoS-tagged, and lemmatized versions of the Commentary column. Create 3 new columns, `Tokenized`, `PoS_tagged` and `PoS_lemmatized`, in this order:

1.- New `Tokenized` column, by lower casing and tokenizing the `Commentary` column.

2.- New `PoS_tagged` column, by pos_tagging the `Tokenized` column.

3.- New `PoS_lemmatized` column, by lemmatizing only the words in the `PoS_tagged` column (the tags are kept as they are). Tagging is done before lemmatization so that the tagging function sees the original word forms.



In [None]:
# Load the tab-separated commentary file
df = pd.read_csv('commentary.txt', sep='\t')

# Tokenized: lower-case every commentary, then split it into word tokens
df['Tokenized'] = df['Commentary'].map(lambda text: word_tokenize(text.lower()))

# PoS_tagged: tag each token list, giving a list of (token, tag) pairs per row
df['PoS_tagged'] = df['Tokenized'].map(pos_tag)

# WordNet lemmatizer instance referenced by lemmatize_word
lemmatizer = WordNetLemmatizer()

# Map NLTK PoS tags to WordNet PoS tags
def map_pos_tag(nltk_tag):
    """
    Translate an NLTK part-of-speech tag into the single-letter WordNet PoS code.

    Args:
        nltk_tag (str): Tag as produced by the PoS tagger (e.g. 'NNS', 'VBZ').

    Returns:
        str or None: 'a' (adjective), 'v' (verb), 'n' (noun) or 'r' (adverb),
        chosen by the first letter of the tag; None for any other tag.
    """
    # (tag prefix, WordNet code) pairs, checked in order
    prefix_to_wordnet = (('J', 'a'), ('V', 'v'), ('N', 'n'), ('R', 'r'))
    for prefix, wordnet_pos in prefix_to_wordnet:
        if nltk_tag.startswith(prefix):
            return wordnet_pos
    return None

# Function to lemmatize words based on PoS tags
def lemmatize_word(word, pos_tag):
    """
    Lemmatize one word, using its PoS tag when the tag maps to a WordNet category.

    Args:
        word (str): The token to lemmatize.
        pos_tag (str): The NLTK PoS tag attached to the token.

    Returns:
        tuple: (lemma, pos_tag) - the lemma paired with the unchanged NLTK tag.
    """
    wordnet_pos = map_pos_tag(pos_tag)
    if wordnet_pos:
        lemma = lemmatizer.lemmatize(word, wordnet_pos)
    else:
        # No WordNet equivalent for this tag: use the lemmatizer's default PoS
        lemma = lemmatizer.lemmatize(word)
    return (lemma, pos_tag)

# Lemmatize words in PoS_tagged column and create PoS_lemmatized column
df['PoS_lemmatized'] = df['PoS_tagged'].map(
    lambda tagged_tokens: [lemmatize_word(token, tag) for token, tag in tagged_tokens]
)


In [None]:
>>print(df['Tokenized'][:3])
0    [plenty, of, chances, in, this, game, but, nei...
1    [that, 's, it, !, the, referee, blows, the, fi...
2    [ball, possession, :, tottenham, :, 44, %, ,, ...
Name: Tokenized, dtype: object

>>print(df['PoS_tagged'][:3])
0    [(plenty, NN), (of, IN), (chances, NNS), (in, ...
1    [(that, DT), ('s, VBZ), (it, PRP), (!, .), (th...
2    [(ball, DT), (possession, NN), (:, :), (totten...
Name: PoS_tagged, dtype: object

>>print(df['PoS_lemmatized'][:3])
0    [(plenty, NN), (of, IN), (chance, NNS), (in, I...
1    [(that, DT), ('s, VBZ), (it, PRP), (!, .), (th...
2    [(ball, DT), (possession, NN), (:, :), (totten...
Name: PoS_lemmatized, dtype: object

In [None]:
print(df['Tokenized'][:3])

0    [plenty, of, chances, in, this, game, but, nei...
1    [that, 's, it, !, the, referee, blows, the, fi...
2    [ball, possession, :, tottenham, :, 44, %, ,, ...
Name: Tokenized, dtype: object


In [None]:
print(df['PoS_tagged'][:3])

0    [(plenty, NN), (of, IN), (chances, NNS), (in, ...
1    [(that, DT), ('s, VBZ), (it, PRP), (!, .), (th...
2    [(ball, DT), (possession, NN), (:, :), (totten...
Name: PoS_tagged, dtype: object


In [None]:
print(df['PoS_lemmatized'][:3])

0    [(plenty, NN), (of, IN), (chance, NNS), (in, I...
1    [(that, DT), ('s, VBZ), (it, PRP), (!, .), (th...
2    [(ball, DT), (possession, NN), (:, :), (totten...
Name: PoS_lemmatized, dtype: object


## Step 2 Basic search engine

Implement a basic search engine in a function called `retrieve_similar_commentaries(df, query, k)`, which takes as input the following arguments:

- `df` the previously enriched (tokenized, pos tagged, etc) commentary dataframe.
- `query` a string of any type, which will be the query we will be using to retrieve similar commentaries.
- `k` an integer denoting the top `k` commentaries to be returned (by similarity).

Function performs the following steps:

1 - Tokenize and lemmatize the input query.

2 - For each commentary in the df, compute how similar it is to the query as the number of shared tokens between query and commentary.

3 - We will prioritize noun matches, so the similarity score receives an extra +1 for each matching token that is tagged as a noun in the commentary (i.e., its part of speech starts with `N`). This means that, for example, if your query has 2 tokens, the maximum similarity a commentary can have is 4: 2 for 2 overlapping tokens, and 2 for both tokens being nouns.

4 - The function must return a list of tuples of the form `[(commentary1, sim), (commentary2, sim) ... (commentaryk, sim)]`, where commentaries are ranked by `sim` value in descending order.


```

In [None]:
def retrieve_similar_commentaries(df, query, k):
    """
    Retrieve the top k commentaries from a DataFrame that are most similar to the given query.

    The similarity of a commentary is the number of distinct tokens it shares with
    the query, plus 1 for every shared token that is tagged as a noun (tag starting
    with 'N') in that commentary. Each shared token earns the noun bonus at most
    once, so the score never exceeds twice the number of distinct query tokens.

    Args:
        df (DataFrame): Must provide the columns 'Commentary' (raw text),
            'Tokenized' (list of tokens) and 'PoS_tagged' (list of (token, tag) pairs).
        query (str): The query for which similar commentaries are to be retrieved.
        k (int): The number of similar commentaries to retrieve.

    Returns:
        list: Up to k (commentary, similarity) tuples sorted by similarity in
        descending order; ties keep the row order of `df`. An empty list is
        returned when k is zero or negative.

    Note:
        Query tokens are lower-cased and lemmatized, whereas commentary tokens are
        read from 'Tokenized' unchanged.
        NOTE(review): an inflected query word (e.g. a plural) is therefore compared
        in its lemma form against non-lemmatized commentary tokens - confirm this
        is the intended matching rule.
    """
    # A non-positive k asks for nothing; without this guard a negative k would
    # slice from the end of the ranking and return almost every commentary.
    if k is not None and k <= 0:
        return []

    # Tokenize and lemmatize the query once; a set makes the overlap test cheap
    lemmatizer = WordNetLemmatizer()
    query_tokens = {lemmatizer.lemmatize(token.lower()) for token in word_tokenize(query)}

    similarity_scores = []
    for commentary, tokens, tagged_tokens in zip(df['Commentary'], df['Tokenized'], df['PoS_tagged']):
        # Distinct tokens present in both the query and this commentary
        shared_tokens = query_tokens.intersection(tokens)

        # Collect noun matches by word (not by (word, tag) pair) so that a word
        # tagged with two different noun tags is only rewarded once.
        noun_matches = {
            word for word, tag in tagged_tokens
            if word in shared_tokens and tag.startswith('N')
        }

        similarity_scores.append((commentary, len(shared_tokens) + len(noun_matches)))

    # Highest similarity first; list.sort is stable, so ties keep their row order
    similarity_scores.sort(key=lambda pair: pair[1], reverse=True)

    return similarity_scores[:k]

In [None]:
result = retrieve_similar_commentaries(df, "Manchester United ball", 3)
# Show each hit together with its rank (0 = most similar)
for rank, hit in enumerate(result):
    print(rank, hit)

0 ('Manchester United is in control of the ball.', 5)
1 ('Manchester United is in control of the ball.', 5)
2 ('Jadon Sancho from Manchester United crosses the ball, but it goes out for a goal kick.', 5)


## Step 3 - PMI
Implement and apply the pointwise mutual information (PMI) metric, a word association metric introduced in 1992, to the football commentaries. The purpose of PMI is to extract, from free text, pairs of words or phrases that tend to co-occur more often than expected by chance. For example, PMI(`new`, `york`) would give a higher score than PMI(`new`, `car`) because the chance of finding `new` and `york` together in text is higher than `new` and `car`, despite `new` being a more frequent word than `york`.

The formula for PMI (where `x` and `y` are two words) is:

$PMI(x,y) = log(\frac{p(x,y)}{p(x)p(y)})$




- **Phrase Extraction**: The first step is to extract noun phrases (NPs) and verb phrases (VPs) from the lemmatized data. We will reward cases where NPs and VPs go beyond single word matching.

- **Phrase Counting**: Once you have extracted the NPs and VPs, you'll need to count how many times each phrase occurs in the dataset.

- **Total Counts**: The next step is to compute the total count of all NPs and VPs. This is simply the sum of all the counts in the dictionaries created in the previous step.

- **Identifying Top Phrases**: To reduce computational complexity, we only want to compute PMI for the top occurring NPs and VPs.
- **Creating the PMI Matrix**: Finally, create a PMI matrix using the top NPs and VPs, their counts, and the total counts of NPs and VPs. This matrix is a pandas DataFrame, which will have rows corresponding to the top VPs, columns corresponding to the top NPs, and each cell will contain the PMI value between the corresponding NP and VP.

In [None]:
def extract_phrases(tagged_tokens):
    """
    Chunk a PoS-tagged sentence and collect its noun phrases and verb phrases.

    Args:
    tagged_tokens (list): (word, tag) pairs for one sentence.

    Returns:
    tuple: (noun_phrases, verb_phrases). Each is a list of strings, one per chunk,
    built by joining the chunk's words with single spaces.
    """
    # Chunk grammar: the NP rule is applied first, then the VP rule
    grammar = r"""
        NP: {<DT>?<JJ>*<NN>*} # NP
        VP: {<VB.*><DT>?<JJ>*<NN|NNS>*} # VP
    """
    chunker = nltk.RegexpParser(grammar)

    # One bucket per chunk label of interest; any other label is skipped
    phrases_by_label = {'NP': [], 'VP': []}
    for chunk in chunker.parse(tagged_tokens).subtrees():
        bucket = phrases_by_label.get(chunk.label())
        if bucket is not None:
            bucket.append(' '.join(leaf[0] for leaf in chunk.leaves()))

    return phrases_by_label['NP'], phrases_by_label['VP']


def count_phrases_in_df(df, phrases):
    """
    Build frequency distributions for the extracted noun phrases and verb phrases.

    Args:
    df (DataFrame): Accepted as part of the signature; the counts are computed
        from `phrases` only and `df` is not read.
    phrases (dict): Holds the list of noun phrases under 'nps' and the list of
        verb phrases under 'vps'.

    Returns:
    tuple: (np_counts, vp_counts), two nltk.FreqDist objects.
    """
    noun_phrase_freq, verb_phrase_freq = (
        nltk.FreqDist(phrases[key]) for key in ('nps', 'vps')
    )
    return noun_phrase_freq, verb_phrase_freq

def merge_and_get_top_phrases(np_counts, vp_counts, top_n=100):
    """
    Select the most frequent noun phrases and the most frequent verb phrases.

    Despite the function name, the two distributions are not merged: each one is
    ranked on its own and the results are returned side by side.

    Args:
    np_counts (nltk.FreqDist): Counts of NP phrases. Only `most_common` is called
        on it.
    vp_counts (nltk.FreqDist): Counts of VP phrases. Only `most_common` is called
        on it.
    top_n (int): Maximum number of phrases kept per category. Defaults to 100.

    Returns:
    dict: {'nps': [(phrase, count), ...], 'vps': [(phrase, count), ...]}, each list
    in the order returned by `most_common(top_n)`.
    """
    return {
        'nps': list(np_counts.most_common(top_n)),
        'vps': list(vp_counts.most_common(top_n)),
    }

def compute_pmi_dataframe(df):
    """
    Compute a positive Pointwise Mutual Information (PMI) matrix between the most
    frequent verb phrases (VPs) and noun phrases (NPs) of the commentaries.

    Steps:
      1. Chunk every row of `df['PoS_tagged']` with `extract_phrases`.
      2. Drop noun phrases that consist solely of a stop word.
      3. Count the phrases and keep the most frequent NPs and VPs.
      4. For each commentary, add 1 to the cell of every (top VP, top NP) pair
         whose two phrases were both extracted from that commentary.
      5. Add 2 to every cell (smoothing), derive joint and marginal probabilities
         from the smoothed table and take log(p(np, vp) / (p(np) * p(vp))).
      6. Replace negative PMI values with 0.

    Args:
    df (DataFrame): Must provide a 'PoS_tagged' column holding, for each
        commentary, a list of (word, tag) pairs.

    Returns:
    DataFrame: One row per top VP and one column per top NP; each cell holds the
    (non-negative) PMI value of that NP-VP pair.
    """
    stop_words = {',', 'the', 'a', 'that', 's', '%'}
    phrases = {'nps': [], 'vps': []}
    # Distinct phrases extracted from each commentary, as (NP set, VP set)
    phrases_per_row = []

    # Extract NPs and VPs from each PoS-tagged commentary
    for tagged_tokens in df['PoS_tagged']:
        np_phrases, vp_phrases = extract_phrases(tagged_tokens)
        # Compare the whole phrase with the stop list. Indexing the phrase string
        # with [0] would test only its first character and discard every noun
        # phrase that merely starts with 'a' or 's'.
        np_phrases = [phrase for phrase in np_phrases if phrase.lower() not in stop_words]
        phrases['nps'].extend(np_phrases)
        phrases['vps'].extend(vp_phrases)
        phrases_per_row.append((set(np_phrases), set(vp_phrases)))

    # Count occurrences of NP and VP phrases and keep the most frequent ones
    np_counts, vp_counts = count_phrases_in_df(df, phrases)
    top_phrases_dict = merge_and_get_top_phrases(np_counts, vp_counts)

    nps = [phrase for phrase, _ in top_phrases_dict['nps']]
    vps = [phrase for phrase, _ in top_phrases_dict['vps']]

    # Phrase -> matrix position lookups
    np_position = {phrase: position for position, phrase in enumerate(nps)}
    vp_position = {phrase: position for position, phrase in enumerate(vps)}

    # Co-occurrence matrix: rows are VPs, columns are NPs
    co_occurrence_matrix = np.zeros((len(vps), len(nps)), dtype=int)

    # A pair co-occurs when both phrases were chunked from the same commentary.
    # Using the extracted chunks instead of substring search on the raw text
    # avoids partial-word hits (e.g. a phrase matching inside a longer word).
    for row_nps, row_vps in phrases_per_row:
        np_columns = [np_position[phrase] for phrase in row_nps if phrase in np_position]
        vp_rows = [vp_position[phrase] for phrase in row_vps if phrase in vp_position]
        for vp_row in vp_rows:
            for np_column in np_columns:
                co_occurrence_matrix[vp_row, np_column] += 1

    # Add-2 smoothing keeps every cell strictly positive so the log is defined
    co_occurrence_matrix += 2
    total_count = np.sum(co_occurrence_matrix)

    # Probabilities derived from the smoothed co-occurrence table
    p_np = np.sum(co_occurrence_matrix, axis=0) / total_count  # Probability of each NP
    p_vp = np.sum(co_occurrence_matrix, axis=1) / total_count  # Probability of each VP
    p_np_vp = co_occurrence_matrix / total_count  # Joint probability of each NP-VP pair

    # PMI = log(p(np, vp) / (p(vp) * p(np)))
    pmi_matrix = np.log(p_np_vp / np.outer(p_vp, p_np))

    # Replace negative PMI values with 0
    pmi_matrix = np.maximum(pmi_matrix, 0)

    return pd.DataFrame(pmi_matrix, index=vps, columns=nps)

In [None]:


pmidf = compute_pmi_dataframe(df)

In [None]:
# you can test your resulting matrix
def top_k_vps(pmi_matrix, np, k):
    """
    List the k verb phrases with the highest PMI for a given noun phrase.

    Args:
        pmi_matrix (DataFrame): PMI values with verb phrases as the row index and
            noun phrases as the columns.
        np (str): The noun phrase (column label) to look up.
        k (int): Number of verb phrases to return.

    Returns:
        list: Row labels of the k largest values in that column, largest first.
        An empty list (after printing a message) when the noun phrase is not a
        column of the matrix.
    """
    # Guard clause: unknown noun phrase
    if np not in pmi_matrix.columns:
        print(f"Noun phrase '{np}' not found in PMI matrix.")
        return []

    best_scores = pmi_matrix.loc[:, np].nlargest(k)
    return best_scores.index.tolist()
top_k_vps(pmidf, 'joao cancelo', 3)

['stops', 'parried', 'puts']