In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from textblob import TextBlob
from wordcloud import WordCloud
from xml.sax import ContentHandler, parse

# English stopword list, held as a set for fast membership tests
stop = set(stopwords.words('english'))
# English Snowball stemmer; ignore_stopwords=True makes it return stopwords unstemmed
stemmer = SnowballStemmer(language='english', ignore_stopwords=True)

%matplotlib inline

In [2]:
%%time

# Reference https://goo.gl/KaOBG3
class ExcelHandler(ContentHandler):
    """SAX content handler that collects the cell text of an XML spreadsheet.

    Elements are matched by the literal names "Cell", "Row" and "Table".
    After parsing, ``tables`` holds one entry per closed ``Table`` element;
    each entry is a list of rows, and each row is a list of cell strings.
    """

    def __init__(self):
        # Initialise the base handler so its own state is set up as well
        super().__init__()
        self.chars = []   # text fragments seen since the current Cell opened
        self.cells = []   # cell strings of the row being read
        self.rows = []    # rows of the table being read
        self.tables = []  # every table completed so far

    def characters(self, content):
        """Buffer a chunk of character data (the parser may deliver text in pieces)."""
        self.chars.append(content)

    def startElement(self, name, atts):
        """Reset the buffer that belongs to the element being opened."""
        if name == "Cell":
            self.chars = []
        elif name == "Row":
            self.cells = []
        elif name == "Table":
            self.rows = []

    def endElement(self, name):
        """Store the finished cell, row or table in its parent container."""
        if name == "Cell":
            # Join the buffered fragments into the final cell string
            self.cells.append(''.join(self.chars))
        elif name == "Row":
            self.rows.append(self.cells)
        elif name == "Table":
            self.tables.append(self.rows)

# Parse the XML spreadsheet; the first row of the first table is the header
excelHandler = ExcelHandler()
parse('features.xls', excelHandler)
features = pd.DataFrame(
    data=excelHandler.tables[0][1:],
    columns=excelHandler.tables[0][0],
)
# Binary target: objective = 0, subjective = 1
y = np.where(features['Label'].eq('objective'), 0, 1)
# Replace the textual label by the numeric one, drop the identifier columns
# and convert every remaining column to integers
features = (
    features
    .assign(Label=y)
    .drop(columns=['TextID', 'URL'])
    .astype(int)
)

texts = []
preprocessed_texts = []

# Read Raw_data/Text0001.txt ... Raw_data/Text1000.txt in order
for i in range(1, 1001):
    # Zero-pad the index to four digits (1 -> '0001', 1000 -> '1000')
    number = '{:04d}'.format(i)

    # The context manager closes each file as soon as it has been read
    with open('Raw_data/Text' + number + '.txt', 'r', encoding='latin-1') as f:
        text = f.read()

    # Keep alphabetic tokens that are not stopwords, then stem them.
    # NOTE(review): the stopword test is case-sensitive; if the NLTK list is all
    # lowercase, capitalised stopwords (e.g. sentence-initial "The") pass the
    # filter. Left unchanged because fixing it alters every downstream feature.
    preprocessed_text = ' '.join(
        stemmer.stem(w) for w in word_tokenize(text) if w.isalpha() and w not in stop
    )
    texts.append(text)
    preprocessed_texts.append(preprocessed_text)

# One row per document: raw text, preprocessed text and the 0/1 label
data = pd.DataFrame({
    'texts': np.array(texts),
    'preprocessed_texts': np.array(preprocessed_texts),
    'label': y,
})
# Concatenate the preprocessed documents of each class into a single string
objective_texts = ' '.join(data.loc[data['label'] == 0, 'preprocessed_texts'].tolist())
subjective_texts = ' '.join(data.loc[data['label'] == 1, 'preprocessed_texts'].tolist())

Wall time: 8.2 s


### Feature extraction

- `symbols` - total number of symbols (characters) in the raw text
- `sentences` - number of sentences, approximated as the number of newline-separated lines in the raw text
- `unique_words_count` - number of unique words in the preprocessed text
- `unique_words_share` - ratio of the number of unique words to the total number of words in the preprocessed text
- `word_average_len` - average length of the alphabetic tokens in the raw text
- `stopwords_count` - total number of stopword tokens in the raw text
- `stopwords_share` - ratio of `stopwords_count` to the `totalWordsCount` column
- `polarity_raw`, `polarity_preprocessed` - polarity in raw and preprocessed text respectively using [textblob](https://textblob.readthedocs.io/en/dev/)
- `subjectivity`, `subjectivity_preprocessed` - subjectivity in raw and preprocessed text respectively using [textblob](https://textblob.readthedocs.io/en/dev/) (note that the raw-text column is named `subjectivity`, without a `_raw` suffix)

In [3]:
%%time

def _mean_alpha_token_length(tokens):
    """Return the mean length of the alphabetic tokens, or NaN when there are none."""
    lengths = [len(w) for w in tokens if w.isalpha()]
    return np.mean(lengths) if lengths else np.nan


def _unique_share(words):
    """Return the share of distinct words among `words`, or NaN for an empty list."""
    return len(set(words)) / len(words) if words else np.nan


# Tokenise and score every text once; several features below reuse these results
raw_tokens = [word_tokenize(text) for text in texts]
preprocessed_words = [text.split() for text in preprocessed_texts]
raw_sentiments = [TextBlob(text).sentiment for text in texts]
preprocessed_sentiments = [TextBlob(text).sentiment for text in preprocessed_texts]

features['symbols'] = np.array([len(text) for text in texts])
# Sentences are approximated by the number of newline-separated lines
features['sentences'] = np.array([text.count('\n') + 1 for text in texts])
features['unique_words_count'] = np.array([len(set(words)) for words in preprocessed_words])
features['unique_words_share'] = np.array([_unique_share(words) for words in preprocessed_words])
features['word_average_len'] = np.array([_mean_alpha_token_length(tokens) for tokens in raw_tokens])
features['stopwords_count'] = np.array([len([w for w in tokens if w in stop]) for tokens in raw_tokens])
# totalWordsCount is a column that is already present in `features`
features['stopwords_share'] = features['stopwords_count'] / features['totalWordsCount']
features['polarity_raw'] = np.array([s.polarity for s in raw_sentiments])
features['polarity_preprocessed'] = np.array([s.polarity for s in preprocessed_sentiments])
# The raw-text subjectivity column has no `_raw` suffix (unlike `polarity_raw`);
# the name is kept so that the columns of data.csv stay unchanged
features['subjectivity'] = np.array([s.subjectivity for s in raw_sentiments])
features['subjectivity_preprocessed'] = np.array([s.subjectivity for s in preprocessed_sentiments])

# new data file with extracted features
features.to_csv('data.csv', index=False)

Wall time: 17.6 s
