# Text Analytics
## Assignment 2

COMPETITION TASK: 

+ Train a classification model on the training set, where each article belongs to one of 5 categories: ['business', 'entertainment', 'politics', 'sport', 'tech'].

+ Apply the learned model to predict the labels for "testdata.csv".

#### Team: 
Laura Brierton - 15317451, Clodagh Lalor - student#, Jeremy Schiff - student#, Peter Concannon - student#

============================================================================================================================

### Step 1: Import packages

In [26]:
import pandas as pd
import numpy as np
import nltk, json
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

In [5]:
# Load the training articles and the test articles that are to be labelled.
# Both files are read as '^'-delimited, taking column names from the first row.
raw_trainset = pd.read_csv('trainingset.csv',sep='^',header=0)
raw_testdata = pd.read_csv('testdata.csv',sep='^',header=0)
# Last expression of the cell: displays the first rows of the training set.
raw_trainset.head()

Unnamed: 0,content,category
0,French boss to leave EADS The French co-head o...,business
1,"Gamers could drive high-definition TV, films, ...",tech
2,Stalemate in pension strike talks Talks aimed ...,politics
3,Johnny and Denise lose Passport Johnny Vaughan...,entertainment
4,Tautou 'to star in Da Vinci film' French actre...,entertainment


### Step 2: Extract Tokens

In [34]:
## Define the Function to convert raw text to tokens
# Combined stop-word sets keyed by JSON path. convert_tokens() is called once
# per article, so the JSON file and the NLTK corpus are read only on first use.
_STOPLIST_CACHE = {}

# Titles dropped from the token stream (matched case-insensitively).
# NOTE(review): 'mss' looks like a typo for 'ms'/'miss' -- confirm before changing.
_SALUTATIONS = frozenset(
    ['mr', 'mrs', 'mss', 'dr', 'phd', 'prof', 'rev', 'professor']
)

# First letter of a (lower-cased) treebank POS tag -> WordNet POS code.
# Anything not listed falls back to noun, the lemmatizer's default POS.
_WORDNET_POS = {'j': 'a', 'v': 'v', 'n': 'n', 'r': 'r'}


def _load_stoplist(path='stopwords.json'):
    """Return the union of the JSON file's 'en' stop words and NLTK's English list.

    The result is cached per path; edits made to the file after the first
    call are not picked up until the cache entry is removed.
    """
    if path not in _STOPLIST_CACHE:
        # Context manager guarantees the handle is closed even if parsing fails.
        with open(path, encoding="utf8") as handle:
            stopwords_json = json.load(handle)
        _STOPLIST_CACHE[path] = (
            set(stopwords_json['en']) | set(stopwords.words('english'))
        )
    return _STOPLIST_CACHE[path]


def convert_tokens(rawtext, verbose):
    """Convert raw article text into a list of cleaned, lemmatized tokens.

    Pipeline, in order:
      1. Tokenize on runs of word characters (punctuation is discarded).
      2. Drop stop words (case-insensitive match against the combined
         'stopwords.json' + NLTK English list).
      3. Convert ALL-CAPS tokens to Title case; other tokens keep their case.
      4. Drop salutations/titles such as 'Mr' or 'Dr'.
      5. Drop tokens made up solely of digits.
      6. POS-tag the remaining tokens and lemmatize each with WordNet.

    Parameters
    ----------
    rawtext : str
        The text to process.
    verbose : bool or int
        When truthy, print a short preview of the tokens after each step.

    Returns
    -------
    list of str
        The lemmatized tokens, in their original order.
    """
    # Step 1: tokenization.
    tokenizer = RegexpTokenizer(r'\w+')
    token_words = tokenizer.tokenize(rawtext)
    if verbose:
        print('Tokens:' + str(token_words[0:10]))

    # Step 2: remove stop words. Tokens are lower-cased only for the lookup;
    # the surviving tokens keep their original case.
    stoplist_combined = _load_stoplist()
    rmsw_token_words = [
        word for word in token_words if word.lower() not in stoplist_combined
    ]
    if verbose:
        print('Stopwords removed:' + str(rmsw_token_words[0:20]))

    # Step 3: normalise ALL-CAPS tokens to Title case (e.g. 'EADS' -> 'Eads').
    # Nothing is removed here despite the wording of the verbose label.
    rmcap_token_words = [
        word.title() if word.isupper() else word for word in rmsw_token_words
    ]
    if verbose:
        print('CAPITALIZED removed:' + str(rmcap_token_words[0:20]))

    # Step 4: remove salutations.
    rmsalu_token_words = [
        word for word in rmcap_token_words if word.lower() not in _SALUTATIONS
    ]
    if verbose:
        print('Salutation removed:' + str(rmsalu_token_words[0:20]))

    # Step 5: remove purely numeric tokens.
    rmnb_token_words = [
        word for word in rmsalu_token_words if not word.isdigit()
    ]
    if verbose:
        print('Number removed: ' + str(rmnb_token_words[0:20]))

    # Step 6: lemmatization. The first letter of each POS tag selects the
    # WordNet part of speech; unknown tags are lemmatized as nouns.
    wnl = WordNetLemmatizer()
    lemma_words = [
        wnl.lemmatize(word, _WORDNET_POS.get(tag[0].lower(), 'n'))
        for word, tag in nltk.pos_tag(rmnb_token_words)
    ]
    if verbose:
        print('Lemmas : ' + str(lemma_words[0:10]))

    return lemma_words

In [35]:
# Work on a copy so the raw training data stays untouched.
df_handle = raw_trainset.copy()

# Tokenize every article and store the resulting token list in a new column.
# Assigning the whole column at once avoids chained assignment
# (df['Tokens'].iloc[i] = ...), which triggers SettingWithCopyWarning and does
# not update the frame when pandas Copy-on-Write is enabled. It also does not
# rely on the index labels being 0..n-1.
df_handle['Tokens'] = df_handle['content'].apply(
    lambda text: convert_tokens(text, 0)
)

# Last expression of the cell: displays the first ten rows.
df_handle.head(10)

Unnamed: 0,content,category,Tokens
0,French boss to leave EADS The French co-head o...,business,"[French, bos, leave, Eads, French, head, Europ..."
1,"Gamers could drive high-definition TV, films, ...",tech,"[Gamers, drive, high, definition, Tv, film, ga..."
2,Stalemate in pension strike talks Talks aimed ...,politics,"[Stalemate, pension, strike, talk, Talks, aim,..."
3,Johnny and Denise lose Passport Johnny Vaughan...,entertainment,"[Johnny, Denise, lose, Passport, Johnny, Vaugh..."
4,Tautou 'to star in Da Vinci film' French actre...,entertainment,"[Tautou, star, Da, Vinci, film, French, actres..."
5,Media seek Jackson 'juror' notes Reporters cov...,entertainment,"[Media, seek, Jackson, juror, note, Reporters,..."
6,Horror film heads US box office A low-budget h...,entertainment,"[Horror, film, head, box, office, low, budget,..."
7,Kerr frustrated at victory margin Republic of ...,sport,"[Kerr, frustrate, victory, margin, Republic, I..."
8,US casino 'tricks' face ban in UK Controversia...,politics,"[casino, trick, face, ban, Uk, Controversial, ..."
9,Klinsmann issues Lehmann warning Germany coach...,sport,"[Klinsmann, issue, Lehmann, warn, Germany, coa..."
