<a href="https://colab.research.google.com/github/MK316/Spring2024/blob/main/Corpus/TEDdata/NLTK_example_spokenwritten.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# NLTK analysis example: spoken vs. written comparison (0605-updated)

In [None]:
!pip install nltk

In [None]:
import nltk
import pandas as pd
from nltk.tokenize import word_tokenize, sent_tokenize
import matplotlib.pyplot as plt

# Download the necessary NLTK data.
# NLTK >= 3.8.2 loads the sentence/word tokenizer from 'punkt_tab'; older
# releases load it from 'punkt'. Fetching both keeps this cell working with
# whichever NLTK version the unpinned pip install above happened to provide.
nltk.download('punkt')
nltk.download('punkt_tab')

# Sample data
spoken_text = "Well, um, you know, I think we should, uh, go to the park. Don't you agree?"
written_text = "It is suggested that we should go to the park. What do you think?"

# Tokenize text
spoken_words = word_tokenize(spoken_text)
written_words = word_tokenize(written_text)


def _count_phrases(tokens, phrases):
    """Count case-insensitive occurrences of any of `phrases` in `tokens`.

    A phrase may consist of several whitespace-separated words ("you know");
    it is then matched against that many consecutive tokens. Both the tokens
    and the phrases are lower-cased before comparison, so an entry such as
    "I" matches the token "I".
    """
    lowered = [token.lower() for token in tokens]
    targets = {tuple(phrase.lower().split()) for phrase in phrases}
    targets.discard(())  # an empty phrase would otherwise match at every position
    sizes = {len(target) for target in targets}
    return sum(
        1
        for size in sizes
        for start in range(len(lowered) - size + 1)
        if tuple(lowered[start:start + size]) in targets
    )


def _count_contractions(tokens, contractions):
    """Count tokens (or adjacent token pairs) that spell one of `contractions`.

    word_tokenize splits negative contractions in two ("Don't" -> "Do", "n't"),
    so a plain per-token lookup never finds them. Each adjacent pair is
    therefore glued back together and looked up as well.
    """
    lowered = [token.lower() for token in tokens]
    known = {contraction.lower() for contraction in contractions}
    unsplit = sum(1 for token in lowered if token in known)
    rejoined = sum(1 for left, right in zip(lowered, lowered[1:]) if left + right in known)
    return unsplit + rejoined


# Count pronouns (case is normalised inside _count_phrases, so "I" is counted)
pronouns = set(["I", "you", "he", "she", "it", "we", "they", "me", "him", "her", "us", "them"])
spoken_pronouns = _count_phrases(spoken_words, pronouns)
written_pronouns = _count_phrases(written_words, pronouns)

# Count contractions
contractions = set(["don't", "can't", "won't", "isn't", "aren't", "wasn't", "weren't", "hasn't", "haven't", "hadn't", "didn't", "doesn't", "wouldn't", "couldn't", "shouldn't", "mightn't", "mustn't"])
spoken_contractions = _count_contractions(spoken_words, contractions)
written_contractions = _count_contractions(written_words, contractions)

# Count fillers (multi-word fillers such as "you know" are matched as token sequences)
fillers = set(["um", "uh", "you know", "well"])
spoken_fillers = _count_phrases(spoken_words, fillers)
written_fillers = _count_phrases(written_words, fillers)

# Count simple sentence structures: sentences with fewer than 10 tokens.
# The token count comes from word_tokenize, so punctuation marks count too.
spoken_sentences = sent_tokenize(spoken_text)
written_sentences = sent_tokenize(written_text)
spoken_simple_sentences = sum(1 for sentence in spoken_sentences if len(word_tokenize(sentence)) < 10)
written_simple_sentences = sum(1 for sentence in written_sentences if len(word_tokenize(sentence)) < 10)

# Results: one row per feature, one column per register
results = {
    'Feature': ['Pronouns', 'Contractions', 'Fillers', 'Simple Sentences'],
    'Spoken': [spoken_pronouns, spoken_contractions, spoken_fillers, spoken_simple_sentences],
    'Written': [written_pronouns, written_contractions, written_fillers, written_simple_sentences]
}

df = pd.DataFrame(results)

# Display the DataFrame
print(df)

# Grouped bar chart: for every feature, a spoken bar with a written bar beside it
width = 0.25                          # width of a single bar
pos = list(range(len(df['Spoken'])))  # x position of each spoken bar

fig, ax = plt.subplots(figsize=(10, 6))

# Spoken bars sit at `pos`; written bars are shifted right by one bar width
ax.bar(pos, df['Spoken'], width, alpha=0.5, color='b', label='Spoken')
ax.bar([x + width for x in pos], df['Written'], width, alpha=0.5, color='r', label='Written')

# Axis label and chart title
ax.set_ylabel('Counts')
ax.set_title('Comparison of Spoken and Written Language Features')

# Put each tick midway between the two bars of a pair and label it with the feature
ax.set_xticks([x + width / 2 for x in pos])
ax.set_xticklabels(df['Feature'])

# Legend, grid, and render
ax.legend(['Spoken', 'Written'], loc='upper left')
ax.grid()
plt.show()


# POS tagging using nltk

In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag

# Download the necessary NLTK data.
# The resource names changed in NLTK 3.8.2+: the tokenizer reads 'punkt_tab'
# and the English tagger reads 'averaged_perceptron_tagger_eng'. Both the old
# and the new names are requested so the cell runs with either NLTK generation.
for resource in ('punkt', 'punkt_tab',
                 'averaged_perceptron_tagger', 'averaged_perceptron_tagger_eng'):
    nltk.download(resource)

# Sample text
text = "Well, um, you know, I think we should, uh, go to the park. Don't you agree?"

# Tokenize the text
tokens = word_tokenize(text)

# Tag the tokens: pos_tag returns a list of (word, tag) pairs
tagged_tokens = pos_tag(tokens)

# Display the tagged tokens
print(tagged_tokens)


In [None]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from collections import Counter

# Sample DataFrame for demonstration (replace this with your actual DataFrame).
# Each row of the 'Text' column holds one text to be tagged.
data = {
    'Text': [
        "Well, um, you know, I think we should, uh, go to the park. Don't you agree?",
        "It is suggested that we should go to the park. What do you think?"
    ]
}
df = pd.DataFrame(data)

# Download the necessary NLTK data.
# The resource names changed in NLTK 3.8.2+: the tokenizer reads 'punkt_tab'
# and the English tagger reads 'averaged_perceptron_tagger_eng'. Both the old
# and the new names are requested so the cell runs with either NLTK generation.
for resource in ('punkt', 'punkt_tab',
                 'averaged_perceptron_tagger', 'averaged_perceptron_tagger_eng'):
    nltk.download(resource)

# Function to tag text
def tag_text(text):
    """Tokenize `text` and return its list of (word, POS tag) pairs.

    Anything that is not a `str` (for example the NaN pandas puts in an empty
    cell) returns an empty list instead of raising inside the tokenizer, so
    the function can be applied to a whole DataFrame column.
    """
    if not isinstance(text, str):
        return []
    return pos_tag(word_tokenize(text))

# Apply the tag_text function to each element in the 'Text' column and
# store the per-row results in a new 'Tagged' column of the same DataFrame
df['Tagged'] = df['Text'].apply(tag_text)

# Function to count specific tags
def count_tags(tagged_text, tag_prefix):
    """Return how many (word, tag) pairs carry a tag that begins with `tag_prefix`."""
    matches = 0
    for _word, pos in tagged_text:
        if pos.startswith(tag_prefix):
            matches += 1
    return matches

# Nouns per row: tags that start with 'NN'
df['Noun_Count'] = df['Tagged'].apply(count_tags, args=('NN',))

# Verbs per row: tags that start with 'VB'
df['Verb_Count'] = df['Tagged'].apply(count_tags, args=('VB',))

# Show the frame including the two new count columns
print(df)


### Apply to real data:

 + File to upload

In [None]:
from google.colab import files

# This will prompt you to upload a file; whatever files.upload() returns is
# kept in `uploaded`
uploaded = files.upload()

In [None]:
# Name of the file to read; it must match the file chosen in the upload prompt
filename = 'sampleTED.txt'

# Open and read the whole file. The encoding is stated explicitly so the text
# is decoded the same way whatever the platform default is
# (assumes the file is UTF-8 -- adjust if yours is not).
with open(filename, 'r', encoding='utf-8') as file:
    file_contents = file.read()

# Print the first 100 characters as a quick check
print(file_contents[:100])


In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag

# Download the necessary NLTK data.
# The resource names changed in NLTK 3.8.2+: the tokenizer reads 'punkt_tab'
# and the English tagger reads 'averaged_perceptron_tagger_eng'. Both the old
# and the new names are requested so the cell runs with either NLTK generation.
for resource in ('punkt', 'punkt_tab',
                 'averaged_perceptron_tagger', 'averaged_perceptron_tagger_eng'):
    nltk.download(resource)

# Text to analyse: the contents of the file read earlier
text = file_contents

# Tokenize the text
tokens = word_tokenize(text)

# Tag the tokens: pos_tag returns a list of (word, tag) pairs
tagged_tokens = pos_tag(tokens)

# Display only the last 100 tagged tokens to keep the output short
print(tagged_tokens[-100:])


In [None]:
# saving a list file as dataframe
import pandas as pd

# Build a two-column DataFrame from the (word, tag) pairs in tagged_tokens.
# Note: the column named 'Tagged' holds the tag of each word, and this
# statement rebinds the notebook-level name `df`.
df = pd.DataFrame(tagged_tokens, columns=['Word', 'Tagged'])

# Save the DataFrame to a CSV file (index=False leaves the row index out)
df.to_csv('tagged_tokens.csv', index=False)

# Display the DataFrame to verify
print(df)

# Optional: Download the CSV file in Google Colab
# from google.colab import files
# files.download('tagged_tokens.csv')

## Past tense verb

+ [tag documentation](https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html)

+ Data should be a DataFrame with a text column named 'Cleanedtext01' and an ID column named 'TID' (or change the column names in the code below)

In [None]:
from google.colab import files
# Prompt for a file upload (the following cell reads "Cleanedtext01.csv")
uploaded = files.upload()

In [None]:
# Load the CSV into `df`; later cells read its 'Cleanedtext01' and 'TID' columns
df = pd.read_csv("Cleanedtext01.csv")

In [None]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from collections import Counter

# Download the necessary NLTK data.
# The resource names changed in NLTK 3.8.2+: the tokenizer reads 'punkt_tab'
# and the English tagger reads 'averaged_perceptron_tagger_eng'. Both the old
# and the new names are requested so the cell runs with either NLTK generation.
for resource in ('punkt', 'punkt_tab',
                 'averaged_perceptron_tagger', 'averaged_perceptron_tagger_eng'):
    nltk.download(resource)

# Function to tag text
def tag_text(text):
    """Return the (word, POS tag) pairs for `text`, or [] when `text` is not a `str`."""
    if not isinstance(text, str):
        # Any non-str value (including bytes) is skipped rather than tokenized
        return []
    return pos_tag(word_tokenize(text))



# Apply the tag_text function to each element in the 'Cleanedtext01' column;
# tag_text returns an empty list for any cell that is not a string
df['Tagged'] = df['Cleanedtext01'].apply(tag_text)

# Function to count specific tags
def count_tags(tagged_text, tag):
    """Return the number of (word, tag) pairs in `tagged_text` whose tag equals `tag`.

    The comparison is an exact match, e.g. 'VBD' does not match 'VBN'.
    """
    # The loop variable must not be named `tag`: that would shadow the
    # parameter and turn the comparison into an always-true `tag == tag`.
    return sum(1 for _word, token_tag in tagged_text if token_tag == tag)

# Past tense verbs per row: count_tags is called with 'VBD' for every 'Tagged' cell
df['Past_Tense_Verb_Count'] = df['Tagged'].apply(count_tags, args=('VBD',))

# Bare expression so the notebook renders the frame with the new count column
df


+ Past tense verb list as a separate file

In [None]:
import pandas as pd

# Function to extract past tense verb tokens
def extract_vbp_tokens(row, tag='VBD'):
    """Return (TID, word) pairs for the tokens in `row` whose POS tag equals `tag`.

    `row` is indexed with 'TID' and 'Tagged'; the 'Tagged' value is iterated
    as (word, tag) pairs.

    The default tag is 'VBD', the Penn Treebank tag for past tense verbs,
    which is what this section collects. The filter used to be hard-coded to
    'VBP', the tag for non-3rd-person singular present verbs. The function
    name is kept so existing calls keep working; pass tag='VBP' to get the
    previous selection.
    """
    tid = row['TID']
    return [(tid, word) for word, token_tag in row['Tagged'] if token_tag == tag]

# Run extract_vbp_tokens on every row, then flatten the per-row lists
# into a single list of (TID, word) pairs
vbp_list = [
    pair
    for row_pairs in df.apply(extract_vbp_tokens, axis=1)
    for pair in row_pairs
]

# One output row per extracted token
vbp_df = pd.DataFrame(vbp_list, columns=['TID', 'Word'])

# Show the result
print(vbp_df)

# Write the result to a CSV file, leaving out the row index
vbp_df.to_csv('vbp_tokens.csv', index=False)

# Optional: Download the CSV file in Google Colab
# from google.colab import files
# files.download('vbp_tokens.csv')


In [None]:
import nltk
from nltk.corpus import wordnet as wn

# Download the necessary NLTK data.
# NLTK >= 3.8.2 reads the tagset help from 'tagsets_json'; older releases read
# it from 'tagsets'. Both are requested so the help call below works either way.
for resource in ('tagsets', 'tagsets_json', 'wordnet'):
    nltk.download(resource)

# Display the description of the Penn Treebank POS tagset
nltk.help.upenn_tagset()

# WordNet definitions of the grammar terms behind four tag families.
# These are dictionary glosses of the words "noun", "verb", "adjective" and
# "adverb" -- not the Penn Treebank descriptions of the tags themselves
# (those are what nltk.help.upenn_tagset() prints above).
tag_descriptions = {
    'NN': wn.synset('noun.n.01').definition(),
    'VB': wn.synset('verb.n.01').definition(),
    'JJ': wn.synset('adjective.n.01').definition(),
    'RB': wn.synset('adverb.n.01').definition()
}

print(tag_descriptions)


---
The End