In [2]:
#Q1
#(a)
import nltk
#nltk.download('punkt')
#nltk.download('averaged_perceptron_tagger')
#nltk.download('maxent_ne_chunker')
#nltk.download('words')
#nltk.download('stopwords')
from nltk.tag import pos_tag
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import pandas as pd
from sklearn.datasets import fetch_20newsgroups

twenty_data = fetch_20newsgroups(subset='all', shuffle=True, categories=['alt.atheism','talk.religion.misc','comp.graphics','sci.space'])
data = pd.Series(twenty_data.data).astype(str)

def _removeNonAscii(s): 
    return "".join(i for i in s if (ord(i)<123 and ord(i)>96) or (ord(i)<91 and ord(i)>64) or (ord(i)==32) or (ord(i)==10))
clean_data = data.map(lambda x: _removeNonAscii(x))

STOPWORDS_DICT = {lang: set(nltk.corpus.stopwords.words(lang)) for lang in nltk.corpus.stopwords.fileids()}

def get_language(text):
    words = set(nltk.wordpunct_tokenize(text.lower()))
    lang = max(((lang, len(words & stopwords)) for lang, stopwords in STOPWORDS_DICT.items()), key = lambda x: x[1])[0]
    if lang == 'english':
        return True
    else:
        return False
english_data=clean_data[clean_data.apply(get_language)]
english_data.reset_index(inplace=True, drop=True)

token_data = []
for i in range(len(english_data)):
    if english_data[i]:
        token_data.append(word_tokenize(english_data[i]))

POS_data = []
for i in range(len(token_data)):
    temp = nltk.pos_tag(token_data[i])
    POS_data.extend(temp)

        
#(b)
from nltk.collocations import *

bigrams = nltk.collocations.BigramAssocMeasures()
finder = BigramCollocationFinder.from_words(POS_data)

bigram_freq = finder.ngram_fd.items()
bigramFreqTable = pd.DataFrame(list(bigram_freq), columns=['bigram','freq']).sort_values(by='freq', ascending=False)

en_stopwords = set(stopwords.words('english'))

def rightTypes(ngram):
    if '-pron-' in ngram or '' in ngram or ' 'in ngram or 't' in ngram:
        return False
    for word in ngram:
        if word in en_stopwords:
            return False
    acceptable_types = ('JJ', 'JJR', 'JJS', 'NN', 'NNS', 'NNP', 'NNPS')
    second_type = ('NN', 'NNS', 'NNP', 'NNPS')
    if ngram[0][1] in acceptable_types and ngram[1][1] in second_type:
        return True
    else:
        return False

#Freuency filter
bigramFreqTable = pd.DataFrame(list(bigram_freq), columns=['bigram','freq']).sort_values(by='freq', ascending=False)
filtered_bi = bigramFreqTable[bigramFreqTable.bigram.map(lambda x: rightTypes(x))]
freq_bi = filtered_bi[:20].bigram.values

#PMI
finder.apply_freq_filter(20)
bigramPMITable = pd.DataFrame(list(finder.score_ngrams(bigrams.pmi)), columns=['bigram','PMI']).sort_values(by='PMI', ascending=False)
filteredPMI_bi = bigramPMITable[bigramPMITable.bigram.map(lambda x: rightTypes(x))]
pmi_bi = filteredPMI_bi[:20].bigram.values

#T-test
bigramTtable = pd.DataFrame(list(finder.score_ngrams(bigrams.student_t)), columns=['bigram','t']).sort_values(by='t', ascending=False)
filteredT_bi = bigramTtable[bigramTtable.bigram.map(lambda x: rightTypes(x))]
t_bi = filteredT_bi[:20].bigram.values

#Chi-squared
bigramChiTable = pd.DataFrame(list(finder.score_ngrams(bigrams.chi_sq)), columns=['bigram','chi-sq']).sort_values(by='chi-sq', ascending=False)
filteredChi_bi = bigramChiTable[bigramChiTable.bigram.map(lambda x: rightTypes(x))]
chi_bi = filteredChi_bi[:20].bigram.values

#DataFrame showing top20 results for each method
df = pd.DataFrame({'Freq': freq_bi, 'PMI': pmi_bi, 't-test': t_bi, 'Chi-squared': chi_bi})
df

Unnamed: 0,Freq,PMI,t-test,Chi-squared
0,"((Subject, NNP), (Re, NNP))","((Evelyn, NNP), (Conlon, NNP))","((Subject, NNP), (Re, NNP))","((ALink, NNP), (KSAND, NNP))"
1,"((Organization, NNP), (University, NNP))","((Duck, NNP), (Pond, NNP))","((Organization, NNP), (University, NNP))","((Carnegie, NNP), (Mellon, NNP))"
2,"((Lines, NNP), (NNTPPostingHost, NNP))","((decaycbnewsjcbattcom, NN), (deankaflowitz, NN))","((Lines, NNP), (NNTPPostingHost, NNP))","((Cookamunga, NNP), (Tourist, NNP))"
3,"((Distribution, NNP), (world, NN))","((ancient, NN), (Mayans, NNPS))","((Distribution, NNP), (world, NN))","((Evelyn, NNP), (Conlon, NNP))"
4,"((Lines, NNP), (Distribution, NNP))","((Notre, NNP), (Dame, NNP))","((Lines, NNP), (Distribution, NNP))","((Notre, NNP), (Dame, NNP))"
5,"((world, NN), (NNTPPostingHost, NNP))","((Eau, NNP), (Claire, NNP))","((world, NN), (NNTPPostingHost, NNP))","((OriginalSender, NNP), (isuVACATIONVENARICSCM..."
6,"((Henry, NNP), (Spencer, NNP))","((Tape, NNP), (Cites, NNP))","((Henry, NNP), (Spencer, NNP))","((fait, NN), (comme, NN))"
7,"((Computer, NNP), (Science, NNP))","((Frequently, NNP), (Asked, NNP))","((Computer, NNP), (Science, NNP))","((Eau, NNP), (Claire, NNP))"
8,"((TIN, NNP), (version, NN))","((Southwestern, NNP), (Louisiana, NNP))","((TIN, NNP), (version, NN))","((Duck, NNP), (Pond, NNP))"
9,"((version, NN), (PL, NNP))","((fait, NN), (comme, NN))","((version, NN), (PL, NNP))","((Mantis, NNP), (Consultants, NNP))"


(c) There is quite a bit of overlap between the Frequency Filter method and the t-test method, so much so that both methods result in the same top 20 with 4 of them arranged differently. Meanwhile for chi-squared test and PMI methods there is a bit of overlap between but not nearly as much as between the other two.  

I think that the union of the results would only make sense if there was a lot more filtering for actual words and if the duplicates were dropped.  
    
It is also apparent that the PMI and chi-squared methods are more likely to return collocations that aren't true English words, likely because those pairs of words only appear a few times (or even only once) together and never apart as is common with names of people, businesses, etc.

In [None]:
#2
#(a)
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

#Because I have already removed numbers non-letter characters I will just remove stopwords and stem the words now
ps = PorterStemmer()
stemmed = []
for i in range(len(token_data)):
    temp = []
    for w in token_data[i]:
        if w not in en_stopwords:
            temp.append(ps.stem(w).lower())
    stemmed.append(temp)

#(b)
#making a callable tokenizer function to bypass the Tfidfvectorizer's tokenization because my data is already tokenized
def tokenize(text):
    return text
vect = TfidfVectorizer(tokenizer=tokenize, lowercase=False)
X = vect.fit_transform(stemmed)

#(c)
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
import numpy as np
from sklearn.model_selection import train_test_split as tts
from sklearn.metrics import confusion_matrix, accuracy_score

#I have found that the document at index 87 was deleted because it wasn't in English
target = twenty_data.target[np.arange(len(twenty_data.target))!=87]

print(X)
print(len(target))
X_train, X_test, y_train, y_test = tts(X, target, test_size=0.3)

svm = SVC(gamma='scale', kernel='rbf').fit(X_train, y_train)
pred_svm = svm.predict(X_test)
matrix_svm = confusion_matrix(y_test, pred_svm)
print("SVM confusion matrix:\n", matrix_svm)

nb = MultinomialNB().fit(X_train, y_train)
pred_nb = nb.predict(X_test)
matrix_nb = confusion_matrix(y_test, pred_nb)
print("NB confusion matrix:\n", matrix_nb)

svm_score = accuracy_score(y_test, pred_svm)
nb_score = accuracy_score(y_test, pred_nb)
print("\nSVM score:",svm_score,"  NB score:",nb_score)

svm2 = SVC(gamma='scale', kernel='linear').fit(X_train, y_train)
pred_svm2 = svm2.predict(X_test)
svm_score2 = accuracy_score(y_test, pred_svm2)
print("\nkernel scores:\nrbf:",svm_score, "linear:",svm_score2)
print("Yes the different kernels do affect the accuracy, especially between the linear (highest score) and rbf (default)")


#(d)
tokens = []
for i in range(len(data)):
    tokens.append(word_tokenize(data[i]))

POS = []
for i in range(len(tokens)):
    temp = nltk.pos_tag(tokens[i])
    POS.extend(temp)
    
def correctTypes(text):
    if '-pron-' in text or '' in text or ' 'in text or 't' in text:
        return False
    for word in text:
        if word in en_stopwords:
            return False
    acceptable_types = ('NN', 'NNS', 'NNP', 'NNPS')
    if text[1] in acceptable_types:
        return True
    else:
        return False
POS = pd.Series(POS)
POS = POS.map(lambda x: _removeNonAscii(x))
POS = POS[POS.map(lambda x: correctTypes(x))]
POS = POS.tolist()
    
stems = []
for i in range(len(POS)):
    temp = []
    for w in POS[i]:
        if w not in en_stopwords:
            temp.append(ps.stem(w).lower())
    stems.append(temp)

print(stems)
X2 = vect.fit_transform(stems)

#Repeat of question 'c'
X_train2, X_test2, y_train2, y_test2 = tts(X2, target, test_size=0.3)

svm_2 = SVC(gamma='scale', kernel='rbf').fit(X_train2, y_train2)
pred_svm_2 = svm_2.predict(X_test2)
matrix_svm_2 = confusion_matrix(y_test2, pred_svm_2)
print("SVM confusion matrix:\n", matrix_svm_2)

nb_2 = MultinomialNB().fit(X_train2, y_train2)
pred_nb_2 = nb_2.predict(X_test2)
matrix_nb_2 = confusion_matrix(y_test2, pred_nb_2)
print("NB confusion matrix:\n", matrix_nb)

svm_score_2 = accuracy_score(y_test2, pred_svm_2)
nb_score_2 = accuracy_score(y_test2, pred_nb_2)
print("\nSVM score:",svm_score_2,"  NB score:",nb_score_2)

#The accuracy for the text classification is much higher when only the nouns are being used
print("\n# of words with nouns:", len(POS_data), "   # of words without nouns:", len(POS))