# Extrakcia čŕt

Ukážka extrakcie a spracovania čŕt na malej vzorke dát.

In [41]:
import ast
import json
import os
import re
import string
from copy import deepcopy
from urllib import parse, request

import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from textstat.textstat import textstat

### Načítaj CSV

In [12]:
# Load the sample comments; the first CSV column is used as the row index.
data = pd.read_csv('sample.csv', index_col=0)

In [13]:
data.shape

(5, 1)

In [14]:
data

Unnamed: 0,textDisplay
0,It has been explained many times - they simply...
1,+zebbedi I'm comparing your pipe dream to thos...
2,yes
3,+Maxwell Montgomery That is a very restrictive...
4,Well Tyler .. Do you ever ask yourself why bot...


### Lematizácia, odstránenie stop-slov

In [16]:
lemmatizer = WordNetLemmatizer()

# Build the stop-word collection once. The original called
# stopwords.words('english') for every single word and scanned the resulting
# list linearly; a set gives the same membership answers with O(1) lookups.
english_stopwords = set(stopwords.words('english'))


def lemmatize_without_stopwords(comment):
    """Drop English stop words from ``comment`` and lemmatize what remains.

    The comment is split on whitespace, so punctuation stays attached to the
    neighbouring token. Stop words are matched case-insensitively (via
    ``casefold``); the surviving tokens keep their original casing and are
    passed to the lemmatizer without a part-of-speech tag.

    Args:
        comment: Raw comment text.

    Returns:
        The remaining lemmatized tokens joined by single spaces (an empty
        string when every token was a stop word).
    """
    kept = [word for word in comment.split()
            if word.casefold() not in english_stopwords]
    return ' '.join(lemmatizer.lemmatize(word) for word in kept)


comments = data['textDisplay']

new_comments = [lemmatize_without_stopwords(comment) for comment in comments]

# Assign the plain list so values are placed positionally. Wrapping it in
# pd.DataFrame() would create a fresh 0..n-1 index and pandas would align on
# it, yielding NaN whenever the CSV index is not exactly 0..n-1.
data['textDisplayProcessed'] = new_comments

data.shape

(5, 2)

In [17]:
data

Unnamed: 0,textDisplay,textDisplayProcessed
0,It has been explained many times - they simply...,explained many time - simply believe Trump say...
1,+zebbedi I'm comparing your pipe dream to thos...,+zebbedi I'm comparing pipe dream lottery winn...
2,yes,yes
3,+Maxwell Montgomery That is a very restrictive...,+Maxwell Montgomery restrictive interpretation...
4,Well Tyler .. Do you ever ask yourself why bot...,Well Tyler .. ever ask party people hate them....


### Extrakcia dĺžky

In [25]:
comments = data['textDisplay']
comments_processed = data['textDisplayProcessed']

# Character lengths of the raw and the processed comment text.
ln = comments.str.len()
ln_p = comments_processed.str.len()

# Word counts: number of whitespace-separated tokens in each comment.
count = comments.str.split().str.len()
count_processed = comments_processed.str.split().str.len()

# The Series above share data's index, so the assignments line up row by row
# regardless of what the CSV index looks like (a pd.DataFrame built from a
# list would be aligned on a new 0..n-1 index instead).
data['textDisplayWordsCount'] = count
data['textDisplayProcessedWordsCount'] = count_processed
data['textDisplayLength'] = ln
data['textDisplayProcessedLength'] = ln_p

data.shape

(5, 6)

In [26]:
data

Unnamed: 0,textDisplay,textDisplayProcessed,textDisplayLength,textDisplayProcessedLength,textDisplayWordsCount,textDisplayProcessedWordsCount
0,It has been explained many times - they simply...,explained many time - simply believe Trump say...,741,479,130,71
1,+zebbedi I'm comparing your pipe dream to thos...,+zebbedi I'm comparing pipe dream lottery winn...,71,50,12,7
2,yes,yes,3,3,1,1
3,+Maxwell Montgomery That is a very restrictive...,+Maxwell Montgomery restrictive interpretation...,120,84,19,10
4,Well Tyler .. Do you ever ask yourself why bot...,Well Tyler .. ever ask party people hate them....,163,92,30,14


### Extrakcia počtu hejterských slov

In [27]:
# Load the lexicon: one entry per line, compared case-insensitively.
# Blank lines are dropped -- an empty alternative in the regex below would
# match at every position and inflate every count.
with open('hatewords.txt', 'r') as hate_file:
    hate = [line.rstrip('\n').casefold() for line in hate_file]
hate = [word for word in hate if word]

exclude = set(string.punctuation)

# Escape each entry so regex metacharacters in the lexicon are matched
# literally instead of being interpreted (or raising re.error).
# NOTE(review): there are no word boundaries in the pattern, so entries also
# match inside longer words; kept as-is to preserve the existing counts.
exp = '(%s)' % '|'.join(re.escape(word) for word in hate)
hate_pattern = re.compile(exp) if hate else None

comments = data['textDisplayProcessed']
word_count = data['textDisplayProcessedWordsCount']

iw = []
iwr = []

# zip() pairs the two columns positionally; the original looked up
# word_count[idx] by label with a positional counter, which only works when
# the frame's index happens to be 0..n-1.
for comment, n_words in zip(comments, word_count):
    # Strip punctuation before matching so "word," and "word" count alike.
    line = ''.join(char for char in comment if char not in exclude)
    hits = len(hate_pattern.findall(line.casefold())) if hate_pattern else 0
    iw.append(hits)
    # A comment made up only of stop words has zero processed words; report a
    # ratio of 0.0 instead of dividing by zero.
    iwr.append(hits / n_words if n_words else 0.0)

# Plain lists are assigned positionally (no index alignment surprises).
data['hatewordsCount'] = iw
data['hatewordsRatio'] = iwr

data.shape

(5, 8)

In [28]:
data

Unnamed: 0,textDisplay,textDisplayProcessed,textDisplayLength,textDisplayProcessedLength,textDisplayWordsCount,textDisplayProcessedWordsCount,hatewordsCount,hatewordsRatio
0,It has been explained many times - they simply...,explained many time - simply believe Trump say...,741,479,130,71,6,0.084507
1,+zebbedi I'm comparing your pipe dream to thos...,+zebbedi I'm comparing pipe dream lottery winn...,71,50,12,7,0,0.0
2,yes,yes,3,3,1,1,0,0.0
3,+Maxwell Montgomery That is a very restrictive...,+Maxwell Montgomery restrictive interpretation...,120,84,19,10,1,0.1
4,Well Tyler .. Do you ever ask yourself why bot...,Well Tyler .. ever ask party people hate them....,163,92,30,14,1,0.071429


### Extrakcia čitateľnosti

In [31]:
comments = data['textDisplay']

# Flesch-Kincaid grade level of each raw (unprocessed) comment.
readability = [textstat.flesch_kincaid_grade(comment) for comment in comments]

# Assign the list directly so values are placed positionally; a pd.DataFrame
# built from the list would be aligned on a new 0..n-1 index and produce NaN
# for any row whose label is not in that range.
data['readabilityScore'] = readability

data.shape

(5, 9)

In [32]:
data

Unnamed: 0,textDisplay,textDisplayProcessed,textDisplayLength,textDisplayProcessedLength,textDisplayWordsCount,textDisplayProcessedWordsCount,hatewordsCount,hatewordsRatio,readabilityScore
0,It has been explained many times - they simply...,explained many time - simply believe Trump say...,741,479,130,71,6,0.084507,6.5
1,+zebbedi I'm comparing your pipe dream to thos...,+zebbedi I'm comparing pipe dream lottery winn...,71,50,12,7,0,0.0,8.0
2,yes,yes,3,3,1,1,0,0.0,-4.6
3,+Maxwell Montgomery That is a very restrictive...,+Maxwell Montgomery restrictive interpretation...,120,84,19,10,1,0.1,13.1
4,Well Tyler .. Do you ever ask yourself why bot...,Well Tyler .. ever ask party people hate them....,163,92,30,14,1,0.071429,7.8


### Analýza sentimentu

In [36]:
SENTIMENT_URL = "https://japerk-text-processing.p.mashape.com/sentiment/"

# Numeric encoding of the label returned by the service; anything else -> -1.
LABEL_CODES = {'neg': 0, 'neutral': 1, 'pos': 2}

# The API key is read from the environment instead of being committed with the
# notebook. Set MASHAPE_KEY before running this cell; a missing variable
# raises KeyError naming it.
request_headers = {
    "X-Mashape-Key": os.environ["MASHAPE_KEY"],
    "Content-Type": "application/x-www-form-urlencoded",
    "Accept": "application/json"
}

comments = data['textDisplay']

neg = []
neutral = []
pos = []
label = []  # neg = 0, neutral = 1, pos = 2

res = []  # raw decoded responses, kept for inspection

for comment in comments:
    my_data = parse.urlencode({"language": "english", "text": comment})
    req = request.Request(SENTIMENT_URL,
                          data=my_data.encode(),
                          headers=request_headers)

    # The context manager closes the HTTP response even if decoding fails.
    with request.urlopen(req) as u:
        # The service is asked for JSON (see the Accept header), so parse it
        # as JSON: ast.literal_eval only accepts Python literals and fails on
        # JSON's true/false/null, and decoding as ASCII fails on any
        # non-ASCII byte in the body.
        result = json.loads(u.read().decode('utf-8'))
    res.append(result)

    neg.append(result['probability']['neg'])
    neutral.append(result['probability']['neutral'])
    pos.append(result['probability']['pos'])

    label.append(LABEL_CODES.get(result['label'], -1))

# Plain lists are assigned positionally, independent of the frame's index.
data['negativeCoefficient'] = neg
data['neutralCoefficient'] = neutral
data['positiveCoefficient'] = pos
data['sentimentLabel'] = label

data.shape

(5, 13)

In [37]:
data

Unnamed: 0,textDisplay,textDisplayProcessed,textDisplayLength,textDisplayProcessedLength,textDisplayWordsCount,textDisplayProcessedWordsCount,hatewordsCount,hatewordsRatio,readabilityScore,negativeCoefficient,neutralCoefficient,positiveCoefficient,sentimentLabel
0,It has been explained many times - they simply...,explained many time - simply believe Trump say...,741,479,130,71,6,0.084507,6.5,0.851879,0.700882,0.148121,1
1,+zebbedi I'm comparing your pipe dream to thos...,+zebbedi I'm comparing pipe dream lottery winn...,71,50,12,7,0,0.0,8.0,0.39121,0.377894,0.60879,2
2,yes,yes,3,3,1,1,0,0.0,-4.6,0.47639,0.48556,0.52361,2
3,+Maxwell Montgomery That is a very restrictive...,+Maxwell Montgomery restrictive interpretation...,120,84,19,10,1,0.1,13.1,0.309484,0.265367,0.690516,2
4,Well Tyler .. Do you ever ask yourself why bot...,Well Tyler .. ever ask party people hate them....,163,92,30,14,1,0.071429,7.8,0.473563,0.20978,0.526437,2


## Normalizácia

In [42]:
# Text columns are carried over unchanged; every other column is rescaled.
ignore_list = ['textDisplay', 'textDisplayProcessed']

# Work on an independent copy so `data` keeps the raw feature values.
data_norm = data.copy()

for key in data:
    if key in ignore_list:
        continue

    # Min-max scaling to [0, 1]: (x - min) / (max - min).
    d_min = data[key].min()
    d_span = data[key].max() - d_min

    if d_span != 0:
        # Vectorized over the whole column; the result keeps data's index, so
        # it lines up with data_norm row by row.
        data_norm[key] = (data[key] - d_min) / d_span
    else:
        # Constant column: there is no spread to scale, so map it to 0.
        data_norm[key] = 0

data_norm.shape

(5, 13)

In [43]:
data_norm

Unnamed: 0,textDisplay,textDisplayProcessed,textDisplayLength,textDisplayProcessedLength,textDisplayWordsCount,textDisplayProcessedWordsCount,hatewordsCount,hatewordsRatio,readabilityScore,negativeCoefficient,neutralCoefficient,positiveCoefficient,sentimentLabel
0,It has been explained many times - they simply...,explained many time - simply believe Trump say...,1.0,1.0,1.0,1.0,1.0,0.84507,0.627119,1.0,1.0,0.0,0.0
1,+zebbedi I'm comparing your pipe dream to thos...,+zebbedi I'm comparing pipe dream lottery winn...,0.092141,0.098739,0.085271,0.085714,0.0,0.0,0.711864,0.150677,0.34232,0.849323,1.0
2,yes,yes,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.307722,0.561554,0.692278,1.0
3,+Maxwell Montgomery That is a very restrictive...,+Maxwell Montgomery restrictive interpretation...,0.158537,0.170168,0.139535,0.128571,0.166667,1.0,1.0,0.0,0.113187,1.0,1.0
4,Well Tyler .. Do you ever ask yourself why bot...,Well Tyler .. ever ask party people hate them....,0.216802,0.186975,0.224806,0.185714,0.166667,0.714286,0.700565,0.302509,0.0,0.697491,1.0
