# Notebook Setup

## Package Installation

In [None]:
!pip install -i https://pypi.clarin-pl.eu lpmn_client -q

## Imports

In [None]:
import os
import re
import string
import xml.etree.ElementTree as ET
from io import StringIO
from shutil import make_archive, rmtree

import pandas as pd
from lpmn_client import Task, download_file_as_dict, upload_file
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from tqdm import tqdm

In [None]:
# Keep rendered DataFrames short (10 rows) while allowing up to 120 characters per cell.
for option_name, option_value in (
    ('display.max_rows', 10),
    ('display.max_colwidth', 120),
):
    pd.set_option(option_name, option_value)

## Constants

In [None]:
# Input CSV with the scraped citations.
DATA_PATH = '../scraper/citations.csv'

# Scratch directory for per-row text files and the base name of its zip archive.
TEMP_DIR, TEMP_ZIP = 'temp_out', 'temp_zip'

# Output files written by this notebook.
CLEAN_DATA_CSV, TFIDF_CSV = 'clean.csv', 'tfidf.csv'

# Integer code for every label name; codes follow the order of the tuple below.
CLASS_LABELS = {
    label_name: label_code
    for label_code, label_name in enumerate(
        ('Fałsz', 'Prawda', 'Manipulacja', 'Nieweryfikowalne')
    )
}

# Data Loading

In [None]:
# Load the raw citations; every column of interest is read as text.
raw_df = pd.read_csv(DATA_PATH, sep=';', dtype={
    'content': str,
    'author': str,
    'label': str
})

# Only the two unambiguous classes are kept for the binary task.
BINARY_LABEL_NAMES = ('Fałsz', 'Prawda')

# The original `df['label'].replace(..., inplace=True)` was a chained in-place
# update: it warns on pandas 2.x and does nothing under copy-on-write, which
# would make the numeric filter below select no rows. Filtering on the label
# names first and then mapping them to their codes avoids that entirely.
df = (
    raw_df
    .dropna()
    .reset_index(drop=True)
    # The index is intentionally NOT reset after filtering: it is used later as
    # the per-row file name (`{index}.txt`).
    .loc[lambda frame: frame['label'].isin(BINARY_LABEL_NAMES)]
    .assign(label=lambda frame: frame['label'].map(CLASS_LABELS).astype(int))
)
df

In [None]:
def df_to_file(df, field, out_dir=None):
    """Write one text file per row of ``df`` into a freshly created directory.

    Any existing directory at the target location is removed first, so the
    directory only ever contains files for the rows of ``df``.

    Args:
        df: DataFrame whose rows are exported.
        field: Name of the column holding the text to write.
        out_dir: Target directory. Defaults to ``TEMP_DIR`` when omitted.

    Each file is named ``<index label>.txt`` and is written as UTF-8 so that
    non-ASCII text does not depend on the platform's default encoding.
    """
    target_dir = TEMP_DIR if out_dir is None else out_dir
    if os.path.exists(target_dir):
        rmtree(target_dir, ignore_errors=True)
    # exist_ok: rmtree above ignores errors, so the directory may still exist.
    os.makedirs(target_dir, exist_ok=True)
    for index, text in df[field].items():
        with open(os.path.join(target_dir, f'{index}.txt'), 'w', encoding='utf-8') as file:
            file.write(text)

# Feature Creation

In [None]:
def _non_space_length(text):
    """Return the number of characters in ``text`` other than the space character.

    Only ``' '`` is excluded; tabs and newlines still count as characters.
    """
    return len(text) - text.count(' ')


def _percent_of_non_space(count, text):
    """Express ``count`` as a percentage of the non-space characters of ``text``.

    Returns 0.0 for empty or all-space text instead of raising ZeroDivisionError.
    """
    total = _non_space_length(text)
    if total == 0:
        return 0.0
    return (count / total) * 100


def count_uppercase_letters(text):
    """Return the percentage of non-space characters that are uppercase."""
    return _percent_of_non_space(sum(1 for char in text if char.isupper()), text)


def count_exclamation_marks(text):
    """Return the percentage of non-space characters that are ``!``."""
    return _percent_of_non_space(text.count('!'), text)


def count_question_marks(text):
    """Return the percentage of non-space characters that are ``?``."""
    return _percent_of_non_space(text.count('?'), text)


def count_quotation_marks(text):
    """Return the percentage of non-space characters that are the ASCII ``"``."""
    return _percent_of_non_space(text.count('"'), text)


def count_punctuation(text):
    """Return the percentage of non-space characters found in ``string.punctuation``."""
    return _percent_of_non_space(
        sum(1 for char in text if char in string.punctuation), text)


def count_text_length(text):
    """Return the length of ``text`` without counting space characters."""
    return _non_space_length(text)

In [None]:
def execute_query(file, lpmn_query):
    """Run an LPMN query on ``file`` and return the downloaded result.

    Args:
        file: Path handed to ``upload_file``.
        lpmn_query: Query string used to build the ``Task``.

    Returns:
        Whatever ``download_file_as_dict`` returns for the task's output file.
    """
    pipeline = Task(lpmn_query)
    result_id = pipeline.run(upload_file(file))
    return download_file_as_dict(result_id)

In [None]:
def remove_punctuation(text):
    """Return ``text`` with every character from ``string.punctuation`` removed.

    Only the ASCII punctuation listed in ``string.punctuation`` is stripped;
    all other characters are kept unchanged.
    """
    strip_table = str.maketrans('', '', string.punctuation)
    return text.translate(strip_table)


def parse_sentiment(data):
    """Summarize the ``Polarity`` column of a semicolon-separated sentiment table.

    Args:
        data: CSV text (``;``-separated) containing a ``Polarity`` column.

    Returns:
        A tuple ``(sentiment_value, positive_words, negative_words)`` where
        ``sentiment_value`` is the sum of all numeric polarities (truncated to
        integers) and the other two are the fractions of ALL rows whose
        polarity is respectively above or below zero. Non-numeric polarities
        are ignored in the sums but still count towards the row total.
        ``(0, 0, 0)`` is returned when the table is empty or cannot be parsed;
        in the latter case the error is printed.
    """
    try:
        stats = pd.read_csv(StringIO(data), sep=';')
        raw_polarity = stats['Polarity']
        total_rows = len(raw_polarity)
        if total_rows == 0:
            return 0, 0, 0

        # to_numeric handles negative strings such as "-1" (which
        # str.isnumeric() rejects) and numpy integer values (which are not
        # instances of the built-in int), both of which used to be dropped.
        polarity = pd.to_numeric(raw_polarity, errors='coerce').dropna()

        sentiment_value = int(polarity.astype(int).sum())
        positive_words = int((polarity > 0).sum()) / total_rows
        negative_words = int((polarity < 0).sum()) / total_rows
    except Exception as e:
        # Deliberate best effort: a malformed response must not abort the
        # whole run, so the row falls back to neutral values.
        print(e)
        sentiment_value = 0
        positive_words = 0
        negative_words = 0

    return sentiment_value, positive_words, negative_words


def parse_lemmatization(text):
    """Join the base forms found in an XML document into one space-separated string.

    Args:
        text: XML text whose root contains ``chunk/sentence/tok/lex/base`` elements.

    Returns:
        The text of every such ``base`` element, in document order, joined by
        single spaces. Elements without text (e.g. ``<base/>``) are skipped,
        because their ``.text`` is ``None`` and would break ``str.join``.
    """
    base_nodes = ET.fromstring(text).findall('chunk/sentence/tok/lex/base')
    return ' '.join(node.text for node in base_nodes if node.text)


def tokenize(text):
    """Split ``text`` on runs of non-word characters and return the pieces as a list.

    Leading or trailing separators produce empty strings at the corresponding
    end of the list, as with any ``re.split`` call.
    """
    # Raw string: '\W' in a plain literal is an invalid escape sequence.
    return re.split(r'\W+', text)


def remove_stopwords(text):
    """Return the ``output_phrase`` column of a headerless, tab-separated table.

    Args:
        text: Tab-separated text with nine columns per row and no header line.

    Returns:
        A list with the third column (``output_phrase``) of every row.
    """
    column_names = [
        'idx',
        'ranking',
        'output_phrase',
        'original_phrase',
        'c-value',
        'length',
        'freq_s',
        'freq_in',
        'context',
    ]
    phrase_table = pd.read_csv(StringIO(text), sep='\t', names=column_names)
    output_phrases = phrase_table['output_phrase']
    return output_phrases.tolist()

In [None]:
# Write every row's raw `content` to its own text file, then zip the directory
# so it can be passed to the sentiment query.
df_to_file(df=df, field='content')
zip_path = make_archive(base_name=TEMP_ZIP, format='zip', root_dir=TEMP_DIR)

## Sentiment

In [None]:
# LPMN query for the sentiment step, one pipeline stage per line.
lpmn_sentiment_query = (
    'any2txt|wcrft2|wsd'
    '|ccl_emo({"lang":"polish"})'
    '|ccl_emo_stats({"lang":"polish", "split_paragraphs": false})'
)
sentiment_dict = execute_query(file=zip_path, lpmn_query=lpmn_sentiment_query)

## Punctuation Removal

In [None]:
# Overwrite each row's text file with its punctuation-free content, then
# rebuild the archive for the lemmatization query.
for index, content in df['content'].items():
    # Explicit UTF-8 so non-ASCII text does not depend on the platform default.
    with open(f'{TEMP_DIR}/{index}.txt', 'w', encoding='utf-8') as file:
        file.write(remove_punctuation(content))
zip_path = make_archive(TEMP_ZIP, 'zip', TEMP_DIR)

## Lemmatization

In [None]:
# LPMN query for the lemmatization step, run on the punctuation-free archive.
lpmn_lemmatization_query = (
    'any2txt'
    '|wcrft2({"guesser":false, "morfeusz2":true})'
)
lemmatized_dict = execute_query(file=zip_path, lpmn_query=lpmn_lemmatization_query)

## Clean CSV Creation

In [None]:
# Surface-level features computed directly from the raw citation text.
surface_extractors = {
    'uppercase%': count_uppercase_letters,
    'exclamation_mark%': count_exclamation_marks,
    'question_mark%': count_question_marks,
    'quotation_mark%': count_quotation_marks,
    'punctuation%': count_punctuation,
    'length': count_text_length,
}
df = df.assign(**{
    column: df['content'].apply(extractor)
    for column, extractor in surface_extractors.items()
})

# Features derived from the service responses, keyed by the row's index label
# (which is also the name of the file that was uploaded for that row).
service_columns = ['sentiment', 'positive_words%', 'negative_words%', 'clean_text']
service_index = []
service_rows = []
for index in tqdm(df.index, total=df.shape[0]):
    file_name = f'{index}.txt'
    try:
        sentiment, positive_words, negative_words = parse_sentiment(sentiment_dict[file_name])
        cleaned_text = parse_lemmatization(lemmatized_dict[file_name])
    except KeyError:
        print(f'KeyError on index: {index}')
        continue

    service_index.append(index)
    service_rows.append({
        'sentiment': sentiment,
        'positive_words%': positive_words,
        'negative_words%': negative_words,
        'clean_text': cleaned_text
    })

service_features = pd.DataFrame(service_rows, index=service_index, columns=service_columns)

# Join on the index label instead of concatenating by position: when a row is
# skipped above, a positional concat would shift every later feature row onto
# the wrong citation. Rows without a service response are dropped (inner join)
# because they have no `clean_text` to vectorize.
df = (
    df
    # Makes the cell safe to re-run after the feature columns were added once.
    .drop(columns=service_columns, errors='ignore')
    .join(service_features, how='inner')
    .reset_index(drop=True)
)
df.to_csv(CLEAN_DATA_CSV)
df

## Cleanup

In [None]:
# Remove the scratch directory and archive. Both removals tolerate a missing
# target so the cell can be re-run without raising.
rmtree(TEMP_DIR, ignore_errors=True)
if os.path.exists(zip_path):
    os.remove(zip_path)

## Stopwords Removal (Optional)

In [None]:
# Optional step, left disabled: uncomment to write the `clean_text` column to
# TEMP_DIR (one file per row) and zip it for the stopword query in the next cell.
# df_to_file(df, 'clean_text')
# zip_path = make_archive(TEMP_ZIP, 'zip', TEMP_DIR)

In [None]:
# Optional step, left disabled: runs the termopl2 query on the archive and
# reads the `output_phrase` column of the first returned file.
# NOTE(review): the call below passes TEMP_ZIP (the archive base name), whereas
# the other queries pass the `zip_path` returned by make_archive — confirm
# which one upload_file expects before enabling this cell.
# lpmn_query = 'any2txt|morphoDita|dir|termopl2({\"mw\":false,\"sw\":\"/resources/termopl/termopl_sw.txt\",' \
#                 '\"cp\":\"/resources/termopl/termopl_cp.txt\"}) '
# downloaded = execute_query(TEMP_ZIP, lpmn_query)
# no_stopwords = remove_stopwords(downloaded[next(iter(downloaded))])

# Vectorization

## N-gram

In [None]:
# Bigram counts over the raw citation text.
ngram_vect = CountVectorizer(ngram_range=(2, 2))
ngram = ngram_vect.fit_transform(df['content'])

# `get_feature_names()` was removed in scikit-learn 1.2; prefer its replacement
# and fall back to the old name only on releases that predate it.
if hasattr(ngram_vect, 'get_feature_names_out'):
    ngram_feature_names = ngram_vect.get_feature_names_out()
else:
    ngram_feature_names = ngram_vect.get_feature_names()

ngram_df = pd.DataFrame(ngram.toarray(), columns=ngram_feature_names)
ngram_df

## TF-IDF

In [None]:
# Numeric features placed in front of the TF-IDF columns in the output file.
handcrafted_columns = [
    'uppercase%',
    'exclamation_mark%',
    'question_mark%',
    'quotation_mark%',
    'punctuation%',
    'length',
    'sentiment',
    'positive_words%',
    'negative_words%',
]

# TF-IDF over uni-, bi- and trigrams of the lemmatized text.
clean_joined = df['clean_text']
tfidf_vect = TfidfVectorizer(ngram_range=(1, 3))
X_tfidf = tfidf_vect.fit_transform(clean_joined)

# Both parts get a fresh 0..n-1 index so they line up row by row.
handcrafted_part = df[handcrafted_columns].reset_index(drop=True)
tfidf_part = pd.DataFrame(X_tfidf.toarray())
X_tfidf_feat = pd.concat([handcrafted_part, tfidf_part], axis=1)
X_tfidf_feat.to_csv(TFIDF_CSV)