In [1]:
#import statements
import pandas as pd
import numpy as np
import os
import re
import time
from nltk import word_tokenize,pos_tag_sents,WordNetLemmatizer
from nltk.corpus import wordnet
from collections import Counter

In [2]:
#get relative path
os.listdir(os.getcwd())

['.git',
 '.ipynb_checkpoints',
 'clean_data.csv',
 'clean_data.ipynb',
 'clean_data_Cldpoly.csv',
 'README.md',
 'train-balanced.csv',
 'train_balanced.csv']

In [3]:
#read the training data from the csv file
header = ['label','comment']
data = pd.read_table('train-balanced.csv',
                    sep='\t', 
                    delimiter=',', 
                    names=header,
                    usecols=[0,1],
                    dtype={'label':int,'comment':str},
                    keep_default_na=False)

In [4]:
#helper function to clean the comments
def comment_clean(user_comment):
    #remove the # from hashtag
    if '#' in user_comment:
        hash_tag = re.search('#',user_comment)
        if hash_tag is not None:
            user_comment = user_comment.replace(hash_tag.group(0),' ')
    #remove the redit tags(r/) from comment
    if 'r/' in user_comment:
        r_tag = re.search('r/',user_comment)
        if r_tag is not None:
            user_comment = user_comment.replace(r_tag.group(0),' ')
    #remove the URL links from comments  
    if 'HTTP' in user_comment:
        # url of the form [link name](http://url)
        url_link = re.search('\[(.*)\(HTTP(.*)\)', user_comment)
        if url_link is not None:
            user_comment = user_comment.replace(url_link.group(0),' ')
    if 'http' in user_comment:
        # url of the form [link name](http://url)
        url_link = re.search('\[(.*)\(http(.*)\)', user_comment)
        if url_link is not None:
            user_comment = user_comment.replace(url_link.group(0),' ')
        else:
            #url of the form http:/
            url_link = re.search('http(.*)', user_comment)
            if url_link is not None:
                user_comment = user_comment.replace(url_link.group(0),' ')                
    return user_comment           

In [5]:
#clean each comment
data['comment'] = data.comment.apply(comment_clean)
# remove data with empty comments
valid_comment = data['comment'] != ' '
data = data[valid_comment]
len(data)

1010783

In [6]:
start_time = time.time()
txt = data['comment'].tolist()
#POS tagging for all the tokens in the sentence
tagged_texts = pos_tag_sents(map(word_tokenize, txt))
end_time = time.time()
data['POS'] = tagged_texts
print("time taken ", end_time-start_time)

time taken  541.9575788974762


In [7]:
#helper function to collect number of interjection
def comment_interjection(user_comment):
    count = Counter(tag for word,tag in user_comment)
    return count['UH']

In [8]:
##feature extraction
# number of interjection
data['interjection']  = data.POS.apply(comment_interjection)

In [9]:
#write the cleaned data into a csv file
data.to_csv('clean_data.csv',
           sep= '|',
           index=False)