# CS 598 PSL Project 3: approach based on Campuswire post [628](https://campuswire.com/c/G06C55090/feed/628)

In [12]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
#from nltk import word_tokenize          
#from nltk.stem import PorterStemmer
#from nltk.stem import WordNetLemmatizer 

SEED = 4031
np.random.seed(SEED)

In [2]:
#from nltk import download
#download("punkt")
#download('wordnet')

In [3]:
# Pull in datasets
train_datasets = []
test_datasets = []
test_ys = []

num_folds = 5

for fold in range(num_folds):
    folder = f"split_{fold+1}/"
    train_datasets.append(pd.read_csv(folder + "train.tsv", sep="\t"))
    test_datasets.append(pd.read_csv(folder + "test.tsv", sep="\t"))
    test_ys.append(pd.read_csv(folder + "test_y.tsv", sep="\t"))

In [4]:
# Custom stopword list
stopwords = ["i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", "yours", "their", "they", "his", \
             "her", "she", "he", "a", "an", "and", "is", "was", "are", "were", "him", "himself", "has", "have", "it", "its", \
             "the", "us"]

## Construct vocabulary

In [70]:
# Use all training data to construct vocabulary.

all_train = pd.concat(train_datasets, axis=0, ignore_index=True)
all_train.drop(columns=["id"], inplace=True)
all_train

#all_train = pd.DataFrame()

#for train_df in train_datasets:
#    all_train = pd.concat([all_train, train_df], axis=0)

Unnamed: 0,sentiment,review
0,1,naturally in a film who is main themes are of ...
1,0,afraid of the dark left me with the impression...
2,0,this has to be one of the biggest misfires eve...
3,0,"this is one of those movies i watched, and won..."
4,0,this movie was dreadful. biblically very inacc...
...,...,...
124995,0,"I am a student of film, and have been for seve..."
124996,0,It seems like more consideration has gone into...
124997,0,I don't believe they made this film. Completel...
124998,0,This 30 minute documentary Buñuel made in the ...


### Preprocess text

In [72]:
# Remove HTML tags and convert to lowercase
all_train["review"] = all_train["review"].str.replace('<.*?>', ' ', regex=True)
# Convert all strings to lowercase
all_train["review"] = all_train["review"].str.lower()

In [73]:
def expand_contractions(reviews):
    
    """
    Routine to expand English contractions, like "isn't" --> "is not".
    This is because "isn't good" and "wasn't good" will both expand to produce the bi-gram "not good".
    The pooled phrase should have more predictive power than the original two phrases.
    """

    # Dictionary of English contractions. Taken from StackOverflow post, which borrowed it from Wikipedia:
    # https://stackoverflow.com/questions/19790188/expanding-english-language-contractions-in-python

    contractions = { 
        "\\bain't\\b": "am not",
        "\\baren't\\b": "are not",
        "\\bcan't\\b": "cannot",
        "\\bcan't've\\b": "cannot have",
        "\\b'cause\\b": "because",
        "\\bcould've\\b": "could have",
        "\\bcouldn't\\b": "could not",
        "\\bcouldn't've\\b": "could not have",
        "\\bdidn't\\b": "did not",
        "\\bdoesn't\\b": "does not",
        "\\bdon't\\b": "do not",
        "\\bhadn't\\b": "had not",
        "\\bhadn't've\\b": "had not have",
        "\\bhasn't\\b": "has not",
        "\\bhaven't\\b": "have not",
        "\\bhe'd\\b": "he would",
        "\\bhe'd've\\b": "he would have",
        "\\bhe'll\\b": "he will",
        "\\bhe'll've\\b": "he will have",
        "\\bhe's\\b": "he is",
        "\\bhow'd\\b": "how did",
        "\\bhow'd'y\\b": "how do you",
        "\\bhow'll\\b": "how will",
        "\\bhow's\\b": "how is",
        "\\bi'd\\b": "i would",
        "\\bi'd've\\b": "i would have",
        "\\bi'll\\b": "i will",
        "\\bi'll've\\b": "i will have",
        "\\bi'm\\b": "i am",
        "\\bi've\\b": "i have",
        "\\bisn't\\b": "is not",
        "\\bit'd\\b": "it would",
        "\\bit'd've\\b": "it would have",
        "\\bit'll\\b": "it will",
        "\\bit'll've\\b": "it will have",
        "\\bit's\\b": "it is",
        "\\blet's\\b": "let us",
        "\\bma'am\\b": "madam",
        "\\bmayn't\\b": "may not",
        "\\bmight've\\b": "might have",
        "\\bmightn't\\b": "might not",
        "\\bmightn't've\\b": "might not have",
        "\\bmust've\\b": "must have",
        "\\bmustn't\\b": "must not",
        "\\bmustn't've\\b": "must not have",
        "\\bneedn't\\b": "need not",
        "\\bneedn't've\\b": "need not have",
        "\\bo'clock\\b": "of the clock",
        "\\boughtn't\\b": "ought not",
        "\\boughtn't've\\b": "ought not have",
        "\\bshan't\\b": "shall not",
        "\\bsha'n't\\b": "shall not",
        "\\bshan't've\\b": "shall not have",
        "\\bshe'd\\b": "she would",
        "\\bshe'd've\\b": "she would have",
        "\\bshe'll\\b": "she will",
        "\\bshe'll've\\b": "she will have",
        "\\bshe's\\b": "she is",
        "\\bshould've\\b": "should have",
        "\\bshouldn't\\b": "should not",
        "\\bshouldn't've\\b": "should not have",
        "\\bso've\\b": "so have",
        "\\bso's\\b": "so is",
        "\\bthat'd\\b": "that would",
        "\\bthat'd've\\b": "that would have",
        "\\bthat's\\b": "that is",
        "\\bthere'd\\b": "there would",
        "\\bthere'd've\\b": "there would have",
        "\\bthere's\\b": "there is",
        "\\bthey'd\\b": "they would",
        "\\bthey'd've\\b": "they would have",
        "\\bthey'll\\b": "they will",
        "\\bthey'll've\\b": "they will have",
        "\\bthey're\\b": "they are",
        "\\bthey've\\b": "they have",
        "\\bto've\\b": "to have",
        "\\bwasn't\\b": "was not",
        "\\bwe'd\\b": "we would",
        "\\bwe'd've\\b": "we would have",
        "\\bwe'll\\b": "we will",
        "\\bwe'll've\\b": "we will have",
        "\\bwe're\\b": "we are",
        "\\bwe've\\b": "we have",
        "\\bweren't\\b": "were not",
        "\\bwhat'll\\b": "what will",
        "\\bwhat'll've\\b": "what will have",
        "\\bwhat're\\b": "what are",
        "\\bwhat's\\b": "what is",
        "\\bwhat've\\b": "what have",
        "\\bwhen's\\b": "when is",
        "\\bwhen've\\b": "when have",
        "\\bwhere'd\\b": "where did",
        "\\bwhere's\\b": "where is",
        "\\bwhere've\\b": "where have",
        "\\bwho'll\\b": "who will",
        "\\bwho'll've\\b": "who will have",
        "\\bwho's\\b": "who is",
        "\\bwho've\\b": "who have",
        "\\bwhy's\\b": "why is",
        "\\bwhy've\\b": "why have",
        "\\bwill've\\b": "will have",
        "\\bwon't\\b": "will not",
        "\\bwon't've\\b": "will not have",
        "\\bwould've\\b": "would have",
        "\\bwouldn't\\b": "would not",
        "\\bwouldn't've\\b": "would not have",
        "\\by'all\\b": "you all",
        "\\by'all'd\\b": "you all would",
        "\\by'all'd've\\b": "you all would have",
        "\\by'all're\\b": "you all are",
        "\\by'all've\\b": "you all have",
        "\\byou'd\\b": "you would",
        "\\byou'd've\\b": "you would have",
        "\\byou'll\\b": "you will",
        "\\byou'll've\\b": "you will have",
        "\\byou're\\b": "you are",
        "\\byou've\\b": "you have"
    }
    
    # Replace all contractions in all reviews.
    for contraction in contractions:
        reviews = reviews.str.replace(contraction, contractions[contraction], regex=True)
        
    return reviews

In [74]:
# Expand English contractions
all_train["review"] = expand_contractions(all_train["review"])

In [75]:
# Vectorize the reviews

vectorizer = CountVectorizer(
    #preprocessor=lambda x: x.lower(), # Convert to lowercase
    stop_words=stopwords,             # Remove stop words
    ngram_range=(1, 4),               # Use 1- to 4-grams
    min_df=0.001,                     # Minimum term frequency
    max_df=0.5,                       # Maximum document frequency
    token_pattern=r"\b[\w+\|']+\b"    # Use word tokenizer, but don't split on apostrophes
)

dtm_train = vectorizer.fit_transform(all_train["review"])

In [90]:
# View the number of ngrams
feature_ngrams = vectorizer.get_feature_names_out()
feature_ngrams.shape

(31701,)

In [91]:
# Output features to file
np.savetxt("all_train_features.txt", feature_ngrams, fmt="%s", delimiter=",")

In [121]:
# Routine to preprocess text: strip out HTML, convert to lowercase, and expand English contractions.

def preprocess_reviews(reviews):
    reviews = reviews.str.replace('<.*?>', ' ', regex=True)
    reviews = reviews.str.lower()
    reviews = expand_contractions(reviews)
    
    return reviews

### Use t-test to identify strongest 2000 positive and negative terms

In [85]:
# Try t-test to identify terms that are strongly associated with only positive or only negative reviews.

dtm_array = dtm_train.toarray()
dtm_pos = dtm_array[all_train.sentiment == 1, :]
dtm_neg = dtm_array[all_train.sentiment == 0, :]

In [86]:
dtm_pos_count = dtm_pos.shape[0]
dtm_neg_count = dtm_neg.shape[0]
dtm_pos_count, dtm_neg_count

(62385, 62615)

In [87]:
dtm_pos_means = np.mean(dtm_pos, axis=0)
dtm_pos_vars = np.var(dtm_pos, axis=0, ddof=1)
dtm_pos_means.shape

(31701,)

In [88]:
dtm_neg_means = np.mean(dtm_neg, axis=0)
dtm_neg_vars = np.var(dtm_neg, axis=0, ddof=1)

In [89]:
# For each term / ngram, compute t-statistic for two independent samples.
# Hmmm...they're not independent, but we can't really pool the variance...

t_statistics = (dtm_pos_means - dtm_neg_means) / np.sqrt((dtm_pos_vars/dtm_pos_count) + (dtm_neg_vars/dtm_neg_count))
t_statistics.shape

(31701,)

In [92]:
feature_statistic_df = pd.DataFrame({"feature": feature_ngrams.tolist(), "statistic": t_statistics.tolist()})

In [93]:
# Look at top 50 positive words
feature_statistic_df.sort_values(by="statistic", ascending=False).iloc[0:50, :]

Unnamed: 0,feature,statistic
10874,great,73.982716
8232,excellent,58.862798
31186,wonderful,53.246837
3247,best,53.155283
17806,of best,51.324386
19152,one of best,48.93239
14764,love,43.82975
20103,perfect,41.751995
1406,amazing,40.271278
2936,beautiful,39.589658


In [95]:
# Look at bottom 50 words (most negative)
feature_statistic_df.sort_values(by="statistic").iloc[0:50, :]

Unnamed: 0,feature,statistic
2537,bad,-94.755462
31315,worst,-84.764359
29641,waste,-68.997657
2465,awful,-64.850629
17113,not even,-58.121304
25264,terrible,-57.600314
31306,worse,-53.553785
3598,boring,-52.661545
24670,stupid,-50.814659
16835,no,-50.695585


In [97]:
# How many terms meet the 0.05 significance threshold?
len(feature_statistic_df[feature_statistic_df.statistic >= 1.645])

11752

In [96]:
# Check for class imbalance
all_train.groupby(["sentiment"]).count()

Unnamed: 0_level_0,review
sentiment,Unnamed: 1_level_1
0,62615
1,62385


In [99]:
# Keep the top 2000 terms by magnitude of t-statistic

n_terms = 2000

feature_statistic_df["abs_statistic"] = abs(feature_statistic_df["statistic"])

top_features = feature_statistic_df.sort_values(by="abs_statistic", ascending=False).iloc[:n_terms, 0]
top_features

2537                     bad
31315                  worst
10874                  great
29641                  waste
2465                   awful
                ...         
24033                sounded
16860              no excuse
26662    this movie horrible
24173                spot on
17214            not in good
Name: feature, Length: 2000, dtype: object

In [103]:
# Add any terms that only appeared in positive reviews or only in negative reviews.
only_positive = feature_ngrams[np.logical_and((dtm_pos_means > 0), (dtm_neg_means == 0))]
only_negative = feature_ngrams[np.logical_and((dtm_pos_means == 0), (dtm_neg_means > 0))]
only_negative

array(['avoid like', 'bad as this', 'bad music', 'could not save',
       'crap like this', 'do not waste money', 'easily worst', 'gave 2',
       'how not to make', 'instead of 1', 'manos', 'not funny not',
       'not waste money on', 'not waste time or', 'this by far worst',
       'this drivel', 'this dull', 'this junk', 'this lame',
       'this piece of garbage', 'this rubbish', 'this stinker',
       'this tripe', 'this turkey', 'this waste', 'this waste of',
       'waste time or', 'waste time or money', 'waste time with this',
       'worst movie ever made', 'worst movies ever seen'], dtype=object)

In [104]:
top_features = list(set(top_features.tolist() + only_positive.tolist() + only_negative.tolist()))
top_features

['really wanted',
 'not to be missed',
 'as good',
 'rest of',
 'heartbreaking',
 'wooden',
 'credibility',
 'stay away from',
 'f',
 'if not seen',
 'performances of',
 'this crap',
 'sucks',
 'watch again',
 'waste of film',
 'haunting',
 'tried',
 'save yourself',
 'horror movie',
 'this movie bad',
 'meaningless',
 'cardboard',
 'most annoying',
 'funny not',
 'tells',
 'turkey',
 'modern',
 'than this',
 'this rubbish',
 'loves',
 'cash in on',
 'no excuse',
 'to live',
 'hated',
 'plan 9 from',
 'may not be',
 'in first place',
 'supporting cast',
 'keeps',
 'awfulness',
 'based on true story',
 'killed',
 'musical',
 'nothing to',
 'for those',
 'to sit through',
 'to recommend',
 'one of all',
 'surprised',
 'obvious',
 'non existent',
 'bad good',
 'far worst',
 'portrayal of',
 'tale',
 'john',
 'only good thing about',
 'tense',
 'bad if',
 'hour',
 'to be missed',
 'only redeeming',
 'excuse for',
 'how not to',
 'funny at all',
 'one of best of',
 '1 out of 10',
 'would no

In [105]:
len(top_features)

2009

### Find best ridge-regression model parameters

In [113]:
# Vectorize the full training set using the top 2000 features

top_feature_vectorizer = CountVectorizer(
    vocabulary=top_features,          # The top 200 features
    stop_words=stopwords,             # Remove stop words
    ngram_range=(1, 4),               # Use 1- to 4-grams
    min_df=0.001,                     # Minimum term frequency
    max_df=0.5,                       # Maximum document frequency
    token_pattern=r"\b[\w+\|']+\b"    # Use word tokenizer, but don't split on apostrophes
)

dtm_vocab_train = top_feature_vectorizer.fit_transform(all_train["review"])

In [107]:
from sklearn.linear_model import LogisticRegressionCV

In [114]:
grid_search = LogisticRegressionCV(Cs=10, cv=5, penalty="l2", scoring="roc_auc", max_iter=100000, random_state=SEED, verbose=1)

In [115]:
all_train_y = all_train["sentiment"]

In [116]:
grid_search.fit(dtm_vocab_train, all_train_y)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  5.9min finished


In [125]:
best_C = grid_search.C_[0]
best_C   # best_C is 166.81005372

166.81005372000558

In [120]:
grid_search.scores_

{1: array([[0.90935775, 0.94354946, 0.96294727, 0.97165258, 0.97480854,
         0.97552131, 0.97560374, 0.97560812, 0.97560816, 0.97560816],
        [0.91301104, 0.9465028 , 0.96414   , 0.97174612, 0.97448715,
         0.97511537, 0.97520611, 0.9752191 , 0.97521909, 0.97521908],
        [0.91137743, 0.94582756, 0.96427532, 0.97222808, 0.97494514,
         0.97547124, 0.97554414, 0.97555846, 0.97555846, 0.97555859],
        [0.91270893, 0.9457004 , 0.96344614, 0.97148555, 0.97442697,
         0.97511359, 0.97520342, 0.97520149, 0.97519389, 0.97519399],
        [0.91288587, 0.94584362, 0.96394406, 0.97208396, 0.97515133,
         0.97586374, 0.97594976, 0.97592957, 0.97590986, 0.97590988]])}

### How well do the 2000-ish terms predict movie review sentiment?

In [123]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

In [133]:
auc_scores = []

for i in range(len(train_datasets)):
    model = LogisticRegression(C=best_C, penalty="l2", max_iter=100000, random_state=SEED, verbose=1)
    
    train_X = top_feature_vectorizer.fit_transform(preprocess_reviews(train_datasets[i]["review"]))
    train_y = train_datasets[i]["sentiment"]
    
    model.fit(train_X, train_y)
    
    test_X = top_feature_vectorizer.transform(preprocess_reviews(test_datasets[i]["review"]))
    test_y = test_ys[i]["sentiment"]
    
    pred_y = model.predict_proba(test_X)[:, 1]  # Predict probabilities for class 1 (positive review)
    
    auc_score = roc_auc_score(test_y, pred_y)
    auc_scores.append(auc_score)
    
    print(f"AUC of split {i+1}: {auc_score}")

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    6.6s finished


AUC of split 1: 0.9562021316566655


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    6.4s finished


AUC of split 2: 0.9564398398628675


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    6.7s finished


AUC of split 3: 0.9546663902205452


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    6.7s finished


AUC of split 4: 0.9565502633921685


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    6.9s finished


AUC of split 5: 0.9548797058844237


### Reduce the number of terms to under 1000

<div class="alert alert-block alert-danger">
<b>Ignore code below this line</b>
</div>

## Attempt at lemmatization

In [None]:
# Now add lemmatizer to preprocessing.
# Code taken from StackOverflow post: https://stackoverflow.com/questions/47423854/sklearn-adding-lemmatizer-to-countvectorizer

# To make this work, need to deal with punctuation. Also need to provide POS tags, like here:
# https://www.machinelearningplus.com/nlp/lemmatization-examples-python/

class LemmaTokenizer(object):
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    def __call__(self, articles):
        return [self.wnl.lemmatize(t) for t in word_tokenize(articles)]

In [None]:
lemma_vectorizer = CountVectorizer(
    lowercase=True,                   # Convert to lowercase
    tokenizer=LemmaTokenizer(),       # Lemmatization
    stop_words=stopwords,             # Remove stop words
    ngram_range=(1, 4),               # Use 1- to 4-grams
    min_df=0.001,                     # Minimum term frequency
    max_df=0.5,                       # Maximum document frequency
    token_pattern=r"\b[\w+\|']+\b"    # Use word tokenizer
)

lemma_dtm_train = lemma_vectorizer.fit_transform(train['review'])

In [None]:
# View some of the ngrams identified
lemma_vectorizer.get_feature_names_out()

In [None]:
# View the number of ngrams
lemma_vectorizer.get_feature_names_out().shape

In [None]:
# Encode as Unicode
feature_names = [l.encode("utf-8") for l in lemma_vectorizer.get_feature_names_out()]

# Output features to file
with open("split_1/lemma_vectorizer_features.txt", "wb") as f:
    for l in feature_names:
        f.write(b'%s\n'%l)

np.savetxt("split_1/lemma_vectorizer_features.txt", lemma_vectorizer.get_feature_names_out(), fmt="%s", delimiter=",")

## Try stemming words before computing their predictive power

In [None]:
train_stem = train_datasets[3]
# Remove HTML tags
train_stem["review"] = train_stem["review"].str.replace('<.*?>', ' ', regex=True)

In [None]:
train_stem

In [None]:
# Remove all punctulation besides apostrophes.
# Regular expression from StackOverflow post:
# https://stackoverflow.com/questions/59877761/how-to-strip-string-from-punctuation-except-apostrophes-for-nlp
train_stem["review"] = train_stem["review"].str.replace('[^\w\d\s\']+', '', regex=True)

In [None]:
train_stem

In [None]:
# Try whitespace tokenizer to avoid splitting words on apostrophes
from nltk.tokenize import WhitespaceTokenizer
tk = WhitespaceTokenizer()
p_stemmer = PorterStemmer()

In [None]:
#from nltk import word_tokenize

for i in range(len(train_stem)):
    train_stem.iloc[i, 2] = " ".join([p_stemmer.stem(review_token) for review_token in tk.tokenize(train_stem.iloc[i, 2])
                                      if review_token not in stopwords])

In [None]:
train_stem

In [None]:
# PorterStemmer class to add to preprocessing.
class StemmerPorter(object):
    def __init__(self):
        self.wnl = PorterStemmer()
        self.tk = WhitespaceTokenizer()
    def __call__(self, articles):
        return [self.wnl.stem(t) for t in self.tk.tokenize(articles) if t not in stopwords]

In [None]:
stem_vectorizer = CountVectorizer(
    lowercase=True,                   # Convert to lowercase
    tokenizer=StemmerPorter(),        # Stemming
    stop_words=stopwords,             # Remove stop words
    ngram_range=(1, 4),               # Use 1- to 4-grams
    min_df=0.001,                     # Minimum term frequency
    max_df=0.5,                       # Maximum document frequency
    token_pattern=r'[^\w\d\s\']+'     # Keep apostrophes while removing other punctuation
)

dtm_train_stem = stem_vectorizer.fit_transform(train_stem['review'])

In [None]:
# View some of the ngrams identified
stem_vectorizer.get_feature_names_out()

In [None]:
# Encode as Unicode
stem_feature_names = [l.encode("utf-8") for l in stem_vectorizer.get_feature_names_out()]

# Output features to file
with open("split_1/stem_vectorizer_features.txt", "wb") as f:
    for l in stem_feature_names:
        f.write(b'%s\n'%l)


In [None]:
# Try t-test to identify terms that are strongly associated with only positive or only negative reviews.

dtm_stem_array = dtm_train_stem.toarray()
dtm_stem_pos = dtm_stem_array[train_stem.sentiment == 1, :]
dtm_stem_neg = dtm_stem_array[train_stem.sentiment == 0, :]

In [None]:
dtm_stem_pos_count = dtm_stem_pos.shape[0]
dtm_stem_neg_count = dtm_stem_neg.shape[0]
dtm_stem_pos_count, dtm_stem_neg_count

In [None]:
dtm_stem_pos_means = np.mean(dtm_stem_pos, axis=0)
dtm_stem_pos_vars = np.var(dtm_stem_pos, axis=0, ddof=1)

In [None]:
dtm_stem_neg_means = np.mean(dtm_stem_neg, axis=0)
dtm_stem_neg_vars = np.var(dtm_stem_neg, axis=0, ddof=1)

In [None]:
# For each term / ngram, compute t-statistic for two independent samples.
# Hmmm...they're not independent, but we can't really pool the variance...

stem_t_statistics = (dtm_stem_pos_means - dtm_stem_neg_means) / np.sqrt((dtm_stem_pos_vars/dtm_stem_pos_count) + (dtm_stem_neg_vars/dtm_stem_neg_count))

In [None]:
stem_feature_ngrams = stem_vectorizer.get_feature_names_out()

In [None]:
stem_feature_statistic_df = pd.DataFrame({"feature": stem_feature_ngrams.tolist(), "statistic": stem_t_statistics.tolist()})

In [None]:
# How many terms meet the 0.05 significance threshold?

len(feature_statistic_df[abs(feature_statistic_df.statistic) >= 1.645])

In [None]:
# Look at top 50 positive ngrams
stem_feature_statistic_df.sort_values(by="statistic", ascending=False).iloc[0:50, :]

In [None]:
# Look at bottom 50 ngrams (most negative)
stem_feature_statistic_df.sort_values(by="statistic").iloc[0:50, :]

In [None]:
# Choose the 2000 most predictive tokens based on their t-statistics
n_tokens = 2000

stem_feature_statistic_df["abs_statistic"] = abs(stem_feature_statistic_df["statistic"])

In [None]:
stem_predictive_tokens = stem_feature_statistic_df.sort_values(by="abs_statistic", ascending=False).iloc[:n_tokens, 0]

In [None]:
stem_predictive_tokens

In [None]:
# Add words that only appeared in positive reviews
dtm_stem_array.shape

In [None]:
stem_feature_ngrams.shape

In [None]:
only_positive = stem_feature_ngrams[np.logical_and((dtm_stem_pos_means > 0), (dtm_stem_neg_means == 0))]
only_negative = stem_feature_ngrams[np.logical_and((dtm_stem_pos_means == 0), (dtm_stem_neg_means > 0))]

In [None]:
only_negative