<a href="https://colab.research.google.com/github/MK316/Spring2024/blob/main/Corpus/TEDdata/NLTK_example_spokenwritten.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install nltk

In [None]:
# This cell sits before the cell that imports nltk, so it must import the
# package itself; otherwise running the notebook top-to-bottom on a fresh
# kernel fails here with NameError.
import nltk

# 'punkt' is the tokenizer model used by older NLTK releases; recent releases
# load 'punkt_tab' instead, so fetch both.
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource)

In [None]:
import nltk
import pandas as pd
from nltk.tokenize import word_tokenize, sent_tokenize
import matplotlib.pyplot as plt

# Download the tokenizer models needed by word_tokenize / sent_tokenize.
# 'punkt' covers older NLTK releases; recent releases look up 'punkt_tab'
# instead and raise LookupError when it is missing, so fetch both.
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource)

# Sample data: one utterance in a spoken register and a written counterpart
spoken_text = "Well, um, you know, I think we should, uh, go to the park. Don't you agree?"
written_text = "It is suggested that we should go to the park. What do you think?"

# Tokenize text into word and punctuation tokens
spoken_words = word_tokenize(spoken_text)
written_words = word_tokenize(written_text)

# Count pronouns
# Every entry is lowercase because tokens are lowercased before the lookup;
# an uppercase "I" entry could never match `word.lower()`, so first-person
# "I" would silently go uncounted.
pronouns = {"i", "you", "he", "she", "it", "we", "they", "me", "him", "her", "us", "them"}
spoken_pronouns = sum(1 for word in spoken_words if word.lower() in pronouns)
written_pronouns = sum(1 for word in written_words if word.lower() in pronouns)

# Count contractions
contractions = {
    "don't", "can't", "won't", "isn't", "aren't", "wasn't", "weren't",
    "hasn't", "haven't", "hadn't", "didn't", "doesn't", "wouldn't",
    "couldn't", "shouldn't", "mightn't", "mustn't",
}


def _count_contractions(tokens):
    """Return how many entries of `contractions` occur in `tokens`.

    word_tokenize splits negative contractions into a stem and an "n't"
    clitic ("Don't" -> "Do", "n't"; "can't" -> "ca", "n't"), so a single
    token never equals an entry such as "don't". Each "n't" token is
    therefore re-joined with the token before it prior to the lookup.
    Tokens that already are a whole contraction are counted as well, so the
    count stays correct for token lists that were not split this way.
    """
    lowered = [token.lower() for token in tokens]
    whole = sum(1 for token in lowered if token in contractions)
    rejoined = sum(
        1
        for stem, clitic in zip(lowered, lowered[1:])
        if clitic == "n't" and stem + clitic in contractions
    )
    return whole + rejoined


spoken_contractions = _count_contractions(spoken_words)
written_contractions = _count_contractions(written_words)

# Count fillers
fillers = {"um", "uh", "you know", "well"}


def _count_fillers(tokens):
    """Return how many filler occurrences appear in `tokens`.

    A filler may span several words ("you know"), which can never equal a
    single token. Each filler is split into its words and matched against
    the run of consecutive lowercased tokens starting at every position.
    """
    lowered = [token.lower() for token in tokens]
    phrases = [tuple(filler.split()) for filler in fillers]
    return sum(
        1
        for start in range(len(lowered))
        for phrase in phrases
        if tuple(lowered[start:start + len(phrase)]) == phrase
    )


spoken_fillers = _count_fillers(spoken_words)
written_fillers = _count_fillers(written_words)

# Count simple sentence structures
# A sentence is treated as "simple" when it tokenizes to fewer than 10 tokens.
# The length is taken from word_tokenize output, so punctuation tokens count.
spoken_sentences = sent_tokenize(spoken_text)
written_sentences = sent_tokenize(written_text)
spoken_simple_sentences = len(
    [s for s in spoken_sentences if len(word_tokenize(s)) < 10]
)
written_simple_sentences = len(
    [s for s in written_sentences if len(word_tokenize(s)) < 10]
)

# Results
# One (feature, spoken count, written count) row per measured feature; the
# rows are then pivoted into the column-oriented dict that pandas consumes.
feature_counts = [
    ('Pronouns', spoken_pronouns, written_pronouns),
    ('Contractions', spoken_contractions, written_contractions),
    ('Fillers', spoken_fillers, written_fillers),
    ('Simple Sentences', spoken_simple_sentences, written_simple_sentences),
]

results = {
    'Feature': [feature for feature, _, _ in feature_counts],
    'Spoken': [spoken for _, spoken, _ in feature_counts],
    'Written': [written for _, _, written in feature_counts],
}

df = pd.DataFrame(results)

# Display the DataFrame
print(df)

# Plotting the results as a grouped bar chart: one group per feature, with
# the written bar drawn directly to the right of the spoken bar.
pos = list(range(len(df['Spoken'])))
width = 0.25
written_pos = [p + width for p in pos]
tick_pos = [p + width / 2 for p in pos]  # centre of each two-bar group

fig, ax = plt.subplots(figsize=(10, 6))

# Bars
ax.bar(pos, df['Spoken'], width, alpha=0.5, color='b', label='Spoken')
ax.bar(written_pos, df['Written'], width, alpha=0.5, color='r', label='Written')

# Axis labels, tick labels and title
ax.set_ylabel('Counts')
ax.set_xticks(tick_pos)
ax.set_xticklabels(df['Feature'])
ax.set_title('Comparison of Spoken and Written Language Features')

# Legend, grid, and display
ax.legend(['Spoken', 'Written'], loc='upper left')
ax.grid()
plt.show()
