# Data Science Club Spring 2023 - Computational Linguistics Workshop 
## Speaker: Isaiah Stapleton

In [1]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re
import pandas as pd
import random
import ast
from nltk.sentiment import SentimentIntensityAnalyzer
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from gensim import corpora, models
import pyLDAvis
import pyLDAvis.gensim
from gensim.models.ldamodel import LdaModel
import pyLDAvis.gensim_models as gensimvis
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer
from sumy.utils import get_stop_words


# Corpus

In [2]:
nltk.corpus.gutenberg.fileids()

['austen-emma.txt',
 'austen-persuasion.txt',
 'austen-sense.txt',
 'bible-kjv.txt',
 'blake-poems.txt',
 'bryant-stories.txt',
 'burgess-busterbrown.txt',
 'carroll-alice.txt',
 'chesterton-ball.txt',
 'chesterton-brown.txt',
 'chesterton-thursday.txt',
 'edgeworth-parents.txt',
 'melville-moby_dick.txt',
 'milton-paradise.txt',
 'shakespeare-caesar.txt',
 'shakespeare-hamlet.txt',
 'shakespeare-macbeth.txt',
 'whitman-leaves.txt']

In [3]:
corpus = nltk.corpus.gutenberg.words('shakespeare-caesar.txt')

In [4]:
corpus = " ".join(corpus)

In [5]:
corpus



# Text Preprocessing 

## Tokenization

In [6]:
tokens = nltk.word_tokenize(corpus)

In [7]:
tokens

['[',
 'The',
 'Tragedie',
 'of',
 'Julius',
 'Caesar',
 'by',
 'William',
 'Shakespeare',
 '1599',
 ']',
 'Actus',
 'Primus',
 '.',
 'Scoena',
 'Prima',
 '.',
 'Enter',
 'Flauius',
 ',',
 'Murellus',
 ',',
 'and',
 'certaine',
 'Commoners',
 'ouer',
 'the',
 'Stage',
 '.',
 'Flauius',
 '.',
 'Hence',
 ':',
 'home',
 'you',
 'idle',
 'Creatures',
 ',',
 'get',
 'you',
 'home',
 ':',
 'Is',
 'this',
 'a',
 'Holiday',
 '?',
 'What',
 ',',
 'know',
 'you',
 'not',
 '(',
 'Being',
 'Mechanicall',
 ')',
 'you',
 'ought',
 'not',
 'walke',
 'Vpon',
 'a',
 'labouring',
 'day',
 ',',
 'without',
 'the',
 'signe',
 'Of',
 'your',
 'Profession',
 '?',
 'Speake',
 ',',
 'what',
 'Trade',
 'art',
 'thou',
 '?',
 'Car',
 '.',
 'Why',
 'Sir',
 ',',
 'a',
 'Carpenter',
 'Mur',
 '.',
 'Where',
 'is',
 'thy',
 'Leather',
 'Apron',
 ',',
 'and',
 'thy',
 'Rule',
 '?',
 'What',
 'dost',
 'thou',
 'with',
 'thy',
 'best',
 'Apparrell',
 'on',
 '?',
 'You',
 'sir',
 ',',
 'what',
 'Trade',
 'are',
 'you',


## Stopword Removal

In [8]:
stop_words = set(nltk.corpus.stopwords.words('english'))

In [9]:
stop_words

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

In [10]:
len(corpus)

114804

In [11]:
corpus = [word for word in tokens if word.lower() not in stop_words]

In [12]:
corpus = " ".join(corpus)

In [13]:
len(corpus)

78782

In [14]:
corpus



## Lemmatization 

In [15]:
text = "The cats were chasing the mice, but then they stopped because they were tired."

In [16]:
wordNetLem = nltk.WordNetLemmatizer()

In [17]:
tokens = nltk.word_tokenize(text)

In [18]:
tokens

['The',
 'cats',
 'were',
 'chasing',
 'the',
 'mice',
 ',',
 'but',
 'then',
 'they',
 'stopped',
 'because',
 'they',
 'were',
 'tired',
 '.']

In [19]:
lemmatized = [wordNetLem.lemmatize(word) for word in tokens]

In [20]:
lemmatized

['The',
 'cat',
 'were',
 'chasing',
 'the',
 'mouse',
 ',',
 'but',
 'then',
 'they',
 'stopped',
 'because',
 'they',
 'were',
 'tired',
 '.']

## Stemming

In [21]:
def stem(word):
    regexp = r'^(.*?)(ing|ly|ed|ious|ies|ive|es|s|ment)?$'
    stem, suffix = re.findall(regexp, word)[0]
    return stem

In [22]:
[stem(t) for t in tokens]

['The',
 'cat',
 'were',
 'chas',
 'the',
 'mice',
 ',',
 'but',
 'then',
 'they',
 'stopp',
 'because',
 'they',
 'were',
 'tir',
 '.']

# Sentiment Analysis 

In [23]:
text = "I love the beach, it's so beautiful and peaceful." 

In [24]:
sa = SentimentIntensityAnalyzer()

In [25]:
sentiment = sa.polarity_scores(text)

In [26]:
print(sentiment)

{'neg': 0.0, 'neu': 0.283, 'pos': 0.717, 'compound': 0.9281}


# Text Similarity 

In [27]:
def jaccard_similarity(text1, text2):
    set1 = set(text1.split())
    set2 = set(text2.split())
    similarity = len(set1.intersection(set2)) / len(set1.union(set2))
    return similarity

In [28]:
text1 = "The quick brown fox jumps over the lazy dog"
text2 = "A quick brown dog jumps over the lazy fox"

In [29]:
similarity = jaccard_similarity(text1, text2)

In [30]:
print("Jaccard similarity:", similarity)

Jaccard similarity: 0.8


# Topic Modeling

In [31]:
doc1 = "Sugar is bad to consume. My sister likes to have sugar, but not my father."
doc2 = "My father spends a lot of time driving my sister around to dance practice."
doc3 = "Doctors suggest that driving may cause increased stress and blood pressure."
doc4 = "Sometimes I feel pressure to perform well at school, but my father never seems to drive my sister to do better."
doc5 = "Health experts say that Sugar is not good for your lifestyle."

In [32]:
documents = [doc1, doc2, doc3, doc4, doc5]
stop_words = set(gensim.parsing.preprocessing.STOPWORDS)
texts = [[word for word in document.lower().split() if word not in stop_words] for document in documents]

In [33]:
dictionary = corpora.Dictionary(texts)

In [34]:
corpus = [dictionary.doc2bow(text) for text in texts]

In [35]:
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary, num_topics=3, random_state=100, update_every=1, chunksize=100, passes=10, alpha='auto', per_word_topics=True)

In [36]:
vis = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary)

  default_term_info = default_term_info.sort_values(


In [37]:
vis

PreparedData(topic_coordinates=              x         y  topics  cluster       Freq
topic                                                
0     -0.092646 -0.017806       1        1  44.385205
2      0.069337 -0.044857       2        1  36.036093
1      0.023309  0.062664       3        1  19.578703, topic_info=        Term      Freq     Total Category  logprob  loglift
0        bad  0.000000  0.000000  Default  30.0000  30.0000
6     sugar,  0.000000  0.000000  Default  29.0000  29.0000
1   consume.  0.000000  0.000000  Default  28.0000  28.0000
3      likes  0.000000  0.000000  Default  27.0000  27.0000
2    father.  0.000000  0.000000  Default  26.0000  26.0000
..       ...       ...       ...      ...      ...      ...
7      dance  0.135562  1.129920   Topic3  -3.9511  -0.4897
12    spends  0.135561  1.129921   Topic3  -3.9511  -0.4898
13      time  0.135559  1.129921   Topic3  -3.9511  -0.4898
9     father  0.135646  1.685889   Topic3  -3.9505  -0.8893
8    driving  0.135615  1.7

# Summarization 

In [38]:
text = "The United States, UK and other Western allies are scrutinising China's growing influence on a host of international organisations including the United Nations, where China is increasingly flexing its muscles in peacekeeping and humanitarian operations. China is also dominating agencies including the World Health Organization and the International Civil Aviation Organization. China's weight in these organisations is in contrast to a drop in US influence under President Donald Trump. China's influence at the UN is amplified by US disengagement under President Trump. One particular example is the World Intellectual Property Organization, where Beijing-backed candidate Daren Tang was elected director-general in March, beating out US-backed candidates. The US has complained that China is undercutting global standards for intellectual property rights."

In [39]:
parser = PlaintextParser.from_string(text, Tokenizer("english"))

In [40]:
summarizer = LsaSummarizer()
summarizer.stop_words = get_stop_words("english")
summary = summarizer(parser.document, sentences_count=2)

In [41]:
summary

(<Sentence: The United States, UK and other Western allies are scrutinising China's growing influence on a host of international organisations including the United Nations, where China is increasingly flexing its muscles in peacekeeping and humanitarian operations.>,
 <Sentence: One particular example is the World Intellectual Property Organization, where Beijing-backed candidate Daren Tang was elected director-general in March, beating out US-backed candidates.>)

# Source Code Summarization - Naive Classification

In [42]:
def function_features(function):
    
    features = {}
    
    features["function_name"] = function.replace("_","").lower()
    
    features["parts_of_speech"] = tuple(nltk.pos_tag(nltk.word_tokenize(function.replace("_"," ").lower())))
    
    return features

In [43]:
data = pd.read_csv("mldata.csv")

In [44]:
data = pd.DataFrame.dropna(data)

In [45]:
function_names = data["Function Name"].tolist()

In [46]:
function_class = data["Statement"].tolist()

In [47]:
labeled_functions = [(a) for a in zip(function_names, function_class)]

In [48]:
random.shuffle(labeled_functions)

In [49]:
featuresets = [(function_features(function_names), function_class) for (function_names,function_class) in labeled_functions]

In [50]:
train_set = featuresets[:28]

In [51]:
test_set = featuresets[28:]

In [52]:
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [53]:
classifier.classify(function_features('twoplustwo'))

'Performs a calculation'

In [54]:
nltk.classify.accuracy(classifier, test_set)

0.4

In [55]:
def showInfo(functionNode):
    print("Method name:", functionNode.name)
    print("Arguments: ", end="")

    args = []
    for arg in functionNode.args.args:
        args.append(arg.arg)

    print(", ".join(args))
    print()

In [56]:
sourceFileName = input("Enter name of source file: ")

In [57]:
with open(sourceFileName) as file:
    node = ast.parse(file.read())

In [58]:
functions = [n for n in node.body if isinstance(n, ast.FunctionDef)]

In [59]:
classes = [n for n in node.body if isinstance(n, ast.ClassDef)]

In [60]:
for function in functions:
    showInfo(function)

Method name: parse_args
Arguments: argv

Method name: readCsv
Arguments: path

Method name: writeCsv
Arguments: data, LANGUAGE, SENTENCES_COUNT

Method name: processCsv
Arguments: path, LANGUAGE, SENTENCES_COUNT

Method name: main
Arguments: argv



In [61]:
methods = None

In [62]:
for c in classes:
    print("Class name:", c.name)

    methods = [n for n in c.body if isinstance(n, ast.FunctionDef)]

    for method in methods:
        showInfo(method)

In [63]:
for function in functions:
    print(function.name + ": " + classifier.classify(function_features(function.name)))

parse_args: Performs a calculation
readCsv: Performs a calculation
writeCsv: Performs a calculation
processCsv: Performs a calculation
main: Performs a calculation


In [64]:
if methods != None:
    for function in methods:
        print(function.name + ": " + classifier.classify(function_features(function.name)))