<a href="https://colab.research.google.com/github/Heity94/01_DataScience_2021/blob/main/TWSM/Class2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Functions

In [1]:
import numpy as np
import pandas as pd
import re

# Build a ranked word-frequency table from raw text
def transform_df(text):
    """Count the whitespace-separated tokens of ``text`` and rank them.

    Parameters
    ----------
    text : str
        Raw text. It is tokenised with ``str.split()`` only; no lower-casing
        or punctuation handling is applied.

    Returns
    -------
    pandas.DataFrame
        Columns ``counts``, ``word`` and ``rank`` (in that order), sorted by
        ``counts`` in descending order. ``rank`` is the descending rank of
        ``counts`` as computed by ``Series.rank``.
    """
    tokens = text.split()
    words, freqs = np.unique(tokens, return_counts=True)

    # The frequencies are passed as the index and then turned into a regular
    # column by reset_index(), which names that column "index".
    table = pd.DataFrame(words, freqs)
    table = table.reset_index()
    table = table.rename(columns={"index": "counts", 0: "word"})
    table = table.sort_values(by="counts", ascending=False)

    table["rank"] = table["counts"].rank(ascending=False)
    return table



def vectorize_sequences(sequences, dimension=10000):
    """Multi-hot encode integer sequences.

    Parameters
    ----------
    sequences : sequence of iterables of int
        One iterable of column indices per sample.
    dimension : int, default 10000
        Number of columns of the result; every index must be valid for it.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(len(sequences), dimension)`` holding 1.0 at
        every index that occurs in the corresponding sequence and 0.0
        elsewhere (repeated indices still give 1.0).
    """
    encoded = np.zeros((len(sequences), dimension))
    for row, token_ids in zip(encoded, sequences):
        # `row` is a view into `encoded`, so writing to it fills the matrix.
        row[list(token_ids)] = 1.
    return encoded

def tf_sequences(sequences, dimension=10000):
    """Build an absolute term-frequency matrix from integer sequences.

    Parameters
    ----------
    sequences : sequence of iterables of int
        One iterable of column indices per sample.
    dimension : int, default 10000
        Number of columns of the result; every index must be valid for it.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(len(sequences), dimension)`` where entry
        ``[i, j]`` is the number of times ``j`` occurs in ``sequences[i]``.
    """
    term_counts = np.zeros((len(sequences), dimension))
    for row, token_ids in zip(term_counts, sequences):
        # Increment one occurrence at a time so repeated ids accumulate;
        # `row` is a view into `term_counts`.
        for token_id in token_ids:
            row[token_id] += 1.
    return term_counts

## Data Sets

In [2]:
# Mount Google Drive into the Colab runtime at /content/drive (Colab-only; prompts for authorisation).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Folder on the mounted Drive that holds the sentiment dictionaries (CSV files).
TWSM_path = "/content/drive/MyDrive/Colab_Notebooks/02_HWR/00_data/SentimentDictionaries/"
# Bing lexicon: per the output, one row per word with a "sentiment" label (positive/negative).
# NOTE(review): the output also shows an "Unnamed: 0" column, presumably a saved row index
# that could be absorbed with index_col=0 -- confirm against the CSV.
bing = pd.read_csv(TWSM_path+"bing.csv")

bing

Unnamed: 0.1,Unnamed: 0,word,sentiment
0,1,2-faces,negative
1,2,abnormal,negative
2,3,abolish,negative
3,4,abominable,negative
4,5,abominably,negative
...,...,...,...
6781,6782,zealously,negative
6782,6783,zenith,positive
6783,6784,zest,positive
6784,6785,zippy,positive


## Gutenberg

In [4]:
!pip install gutenberg

from gutenberg.acquire import load_etext
from gutenberg.cleanup import strip_headers



In [5]:
# Fetch Project Gutenberg e-text no. 42671, pass it through strip_headers (presumably removing the
# Gutenberg licence boilerplate -- see the gutenberg package docs) and trim surrounding whitespace.
text = strip_headers(load_etext(42671)).strip()
# Debugging leftovers: load the raw text without header stripping / print it.
#text = load_etext(42671)
#print(text)

In [6]:
# Alternative data location, kept for reference (disabled).
#TWSM_path = "/content/drive/MyDrive/teaching/TWSM/WorkInClass/"
# AFINN lexicon: per the output, one row per word with an integer "value" score.
afinn = pd.read_csv(TWSM_path+"afinn.csv")

afinn

Unnamed: 0.1,Unnamed: 0,word,value
0,1,abandon,-2
1,2,abandoned,-2
2,3,abandons,-2
3,4,abducted,-2
4,5,abduction,-2
...,...,...,...
2472,2473,yucky,-2
2473,2474,yummy,3
2474,2475,zealot,-2
2475,2476,zealots,-2


In [7]:
# Dictionary used for the sentiment scoring of the Gutenberg text.
dic = afinn

# Disabled scoring sketch: join the dictionary with a word-count table and average value * counts.
# NOTE(review): `df_words` is not defined in the visible cells; transform_df(text) would produce it,
# with the column named "word" (so the first merge variant is the one that matches).
#sentiment_merge = pd.merge(dic, df_words, how="inner", on="word")
#sentiment_merge = pd.merge(dic, df_words, left_on="word", right_on="words")
#sentiment_merge["total_value"] = sentiment_merge["value"] * sentiment_merge["counts"]
#ovr_score = np.mean(sentiment_merge["total_value"])

#print(ovr_score)

## IMDB Movie Reviews


***Loading the IMDB dataset***

In [11]:
from tensorflow.keras.datasets import imdb
# Load the train/test split with the vocabulary limited via num_words=10000; this matches the
# default `dimension=10000` of vectorize_sequences / tf_sequences.
# Each review is a sequence of integer word ids (see the decoding step that follows).
(train_data, train_labels), (test_data, test_labels) = imdb.load_data(
    num_words=10000)

**Decoding reviews back to text**

In [12]:
# word -> id mapping shipped with the dataset, and its inverse (id -> word).
word_index = imdb.get_word_index()
reverse_word_index = {idx: word for word, idx in word_index.items()}

# Ids in the reviews are looked up with an offset of 3 (presumably because the first ids are
# reserved by the loader -- see the Keras docs); ids without an entry are rendered as "?".
decoded_review = " ".join(
    reverse_word_index.get(i - 3, "?") for i in train_data[0])

decoded_review

"? this film was just brilliant casting location scenery story direction everyone's really suited the part they played and you could just imagine being there robert ? is an amazing actor and now the same being director ? father came from the same scottish island as myself so i loved the fact there was a real connection with this film the witty remarks throughout the film were great it was just brilliant so much that i bought the film as soon as it was released for ? and would recommend it to everyone to watch and the fly fishing was amazing really cried at the end it was so sad and you know what they say if you cry at a film it must have been good and this definitely was also ? to the two little boy's that played the ? of norman and paul they were just brilliant children are often left out of the ? list i think because the stars that play them all grown up are such a big profile for the whole film but these children are amazing and should be praised for what they have done don't you th

In [13]:
# Decode every training review back to text, using the same id -> word lookup (offset 3,
# "?" for unknown ids) as for the single review.
N = len(train_data)
decoded_reviews = [
  " ".join(reverse_word_index.get(i - 3, "?") for i in train_data[j])
  for j in range(N)
]

In [5]:
# List every occurrence of the substring "robert" in the first decoded review.
# NOTE(review): the saved output is a NameError and the execution count is out of order, so this
# cell was apparently run before `decoded_review` (or `re`) existed -- re-run the notebook top to bottom.
re.findall("robert", decoded_review)

NameError: ignored

### Tasks

Build a classifier on the train data in at least five different ways:

1. using sentiment analysis
2. using term frequencies in at least **4** different flavors.

The data set in 2. is VERY high-dimensional so please choose a classifier that can deal with feature selection.

### Sentiment analysis

In [12]:
# Load the AFINN sentiment dictionary, using the first CSV column as the row index.
# NOTE(review): this is the same file as TWSM_path + "afinn.csv" (already loaded as `afinn`);
# the variable keeps the spelling `affin` because the scoring loop refers to it by that name.
affin = pd.read_csv("/content/drive/MyDrive/Colab_Notebooks/02_HWR/00_data/SentimentDictionaries/afinn.csv", index_col=0)
affin.head()

Unnamed: 0,word,value
1,abandon,-2
2,abandoned,-2
3,abandons,-2
4,abducted,-2
5,abduction,-2


In [13]:
# function to create term frequency matrix
def tf(txt):
  """Return the term frequencies of ``txt`` as a DataFrame.

  Parameters
  ----------
  txt : str
      Text to analyse; it is tokenised with ``str.split()`` (whitespace only).

  Returns
  -------
  pandas.DataFrame
      Columns ``word`` and ``frequency`` (integer), one row per distinct
      token, ordered from most to least frequent. Empty input yields an
      empty frame with the same two columns.
  """
  words, cts = np.unique(txt.split(), return_counts=True)
  order = np.argsort(cts)[::-1]  # positions from highest to lowest count
  # Build the frame column by column: stacking words and counts into one 2-D
  # array would turn the counts into strings that have to be parsed back.
  df = pd.DataFrame({"word": words[order], "frequency": cts[order]})
  df["frequency"] = df.frequency.astype("int")
  return df

# Calculate mean sentiment value per review
sent_reviews = []

for review in decoded_reviews:

  # word counts of this review
  matrix = tf(review)

  # inner join with the AFINN dictionary: only words that have a score survive
  sent_aff = matrix.merge(affin, on="word")

  # score of each matched word, multiplied by how often it occurs in the review
  sent_aff = sent_aff.assign(total_sentiment=sent_aff["frequency"] * sent_aff["value"])

  # mean over the matched words; NaN when no word of the review is in the dictionary
  sent_reviews.append(sent_aff["total_sentiment"].mean())

In [14]:
from sklearn.linear_model import LogisticRegression

In [15]:
# Targets and a one-column feature matrix holding the mean sentiment score of each review.
y_train = train_labels
# fillna(0): missing scores (presumably reviews without any dictionary word) are set to 0.
X_train = pd.DataFrame(np.array(sent_reviews).reshape(-1, 1)).fillna(0)

In [16]:
# NOTE: despite the name `lin_reg`, this is a logistic-regression classifier with default settings.
lin_reg = LogisticRegression()
lin_reg.fit(X_train, y_train)

LogisticRegression()

In [17]:
# Scored on the same data the model was fitted on, so this is a training score, not a held-out estimate.
lin_reg.score(X_train, y_train) # accuracy

0.72808

In [18]:
from sklearn.metrics import confusion_matrix
# Confusion matrix of the true training labels vs. the model's predictions on the training data.
confusion_matrix(y_train, lin_reg.predict(X_train))

array([[9220, 3280],
       [3518, 8982]])

### Using term frequencies in at least 4 different flavors.
- absolute freq. of each word in each review 
- relative tf matrix
- one-hot
- "weighted" tf -> "IDF"

In [19]:
# decoded_reviews[0]
# One-hot flavour: 1.0 if a word id occurs in a review, 0.0 otherwise (10,000 columns by default).
# NOTE(review): this rebinds X_train, replacing the sentiment-score feature matrix built earlier.
X_train = vectorize_sequences(train_data)
X_test = vectorize_sequences(test_data)

## Two-grams task in class

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# ngram_range=(2, 2): count two-word sequences (bigrams) only.
bi_gram = CountVectorizer(ngram_range = (2,2))

# Learn the bigram vocabulary from the decoded training reviews and build the count matrix.
X_2g = bi_gram.fit_transform(decoded_reviews)

In [16]:
# Bigram vocabulary of the fitted vectorizer.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2; use its replacement
# when available and fall back only on old installations. list() keeps the result a plain list.
if hasattr(bi_gram, "get_feature_names_out"):
  bigram_col = list(bi_gram.get_feature_names_out())
else:
  bigram_col = bi_gram.get_feature_names()



In [21]:
# Collect the bigrams whose first word is "not" (negated expressions).
# Matching on "not " including the separating space keeps out bigrams whose first word merely
# starts with those letters, e.g. "nothing ..." or "notice ...".
new_list = [bigram for bigram in bigram_col if bigram.lower().startswith("not ")]

In [22]:
# Display the matches (the list is long; slice it, e.g. new_list[:20], for a compact view).
new_list

['not 10',
 'not 100',
 'not 12',
 'not 14',
 'not 15',
 'not 1944',
 'not 1963',
 'not 1965',
 'not 1971',
 'not 1985',
 'not 1986',
 'not 1992',
 'not 2005',
 'not 2007',
 'not 24',
 'not 2nd',
 'not 30',
 'not 3rd',
 'not 40',
 'not 50',
 'not 85',
 'not ability',
 'not able',
 'not about',
 'not above',
 'not abraham',
 'not abrupt',
 'not absolute',
 'not absolutely',
 'not absurd',
 'not abuse',
 'not abysmal',
 'not academy',
 'not accept',
 'not acceptable',
 'not accepted',
 'not accepting',
 'not accidental',
 'not acclaimed',
 'not accomplish',
 'not accomplished',
 'not according',
 'not accurate',
 'not accurately',
 'not achieve',
 'not achieved',
 'not acknowledge',
 'not across',
 'not act',
 'not acted',
 'not acting',
 'not action',
 'not active',
 'not actors',
 'not acts',
 'not actual',
 'not actually',
 'not adam',
 'not adaptation',
 'not add',
 'not adding',
 'not address',
 'not addressed',
 'not adds',
 'not admiration',
 'not admitted',
 'not adolescent',
 'n

## 20-Newsgroups 

In this exercise, we will be using the 20-Newsgroups dataset. This version of the dataset contains about 11k newsgroup posts from 20 different topics.

In [None]:
# Import packages
import pandas as pd
import re
from gensim.parsing.preprocessing import STOPWORDS, strip_tags, strip_numeric, strip_punctuation, strip_multiple_whitespaces, remove_stopwords, strip_short, stem_text
import pickle
import en_core_web_sm
import nltk
nltk.download('stopwords')

In [None]:
from nltk.corpus import stopwords

# 1. Import and examine data

In [None]:
# Import dataset: read the newsgroups JSON straight from GitHub (needs network access)
df = pd.read_json('https://raw.githubusercontent.com/selva86/datasets/master/newsgroups.json')
# Preview the first rows
df.head()

In [None]:
# Examine dataset: list the distinct target classes and the number of posts per class
print('Possible target values:')
print(df.target_names.unique())
print(' ')
print('Class distribution:')
print(df.target_names.value_counts())

*The classes are almost uniformly distributed.*


## NLP in spacy


In [None]:
import spacy
# The 'en' shortcut link no longer exists in spaCy 3; load the model package by its full name
# (en_core_web_sm is the package this notebook already imports).
nlp = spacy.load('en_core_web_sm')

In [None]:
# Run the pipeline on a short example text.
doc = nlp(u"Apples and oranges are similar. Boots and hippos aren't.")

# Print every token next to its lemma (base form).
for token in doc:
    print(token,  token.lemma_)