# Laboratory work #1 (text segmentation and annotation)

In [None]:
import pandas as pd
import re
from pathlib import Path

from tqdm import tqdm
from nltk.stem import SnowballStemmer
from nltk.stem import WordNetLemmatizer
import nltk
nltk.download('wordnet')

The dataset-loading code is adapted from https://www.kaggle.com/code/therealsampat/fake-news-detection.

In [None]:
# Read both CSV files and attach the label column: 0 marks fake news, 1 marks true news.
df_fake = pd.read_csv('../data/Fake.csv').assign(**{'class': 0})
df_true = pd.read_csv('../data/True.csv').assign(**{'class': 1})

In [None]:
# Remove rows whose 'text' value repeats within the same frame.
df_fake, df_true = (
    frame.drop_duplicates(subset='text') for frame in (df_fake, df_true)
)

In [None]:
# (rows, columns) of each frame after de-duplication.
df_fake.shape, df_true.shape

In [None]:
# Stack the fake and true frames row-wise, then peek at 10 random rows.
df_merge = pd.concat(objs=[df_fake, df_true], axis='index')
df_merge.sample(n=10)

In [None]:
# Discard the columns that are not used below, then count missing values per remaining column.
df = df_merge.drop(columns=['title', 'subject', 'date'])
df.isna().sum()

In [None]:
# Shuffle the rows so both classes are mixed before the positional split below.
# A fixed seed makes the shuffle (and therefore the split) reproducible after a
# kernel restart; reset_index(drop=True) renumbers the rows 0..n-1 without
# creating a temporary 'index' column.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

In [None]:
# Assign every row to a split by its position: 80% train, 10% val, the rest test.
# This relies on the index being 0..n-1, which holds because the index is reset
# right after the preceding shuffle.
n = len(df)
train_n = int(n * 0.8)
val_n = int(n * 0.1)
test_n = n - train_n - val_n

df['part'] = 'train'
# The lower bound is inclusive: row `train_n` is the first validation row.
# A strict `<` here would leave it in train and make val one row short.
df.loc[(train_n <= df.index) & (df.index < train_n + val_n), 'part'] = 'val'
df.loc[train_n + val_n <= df.index, 'part'] = 'test'

# Sanity check: the three splits have exactly the planned sizes.
assert [
    int((df['part'] == name).sum()) for name in ('train', 'val', 'test')
] == [train_n, val_n, test_n]

In [None]:
# Shuffle once more so train/val/test rows are interleaved in the frame.
# The fresh 0..n-1 index produced here is later used as the output file name,
# so a fixed seed keeps those file names stable between runs.
df = df.sample(frac=1, random_state=43).reset_index(drop=True)

In [None]:
# Preview the first 10 rows, now including the 'part' column.
df.head(10)

In [None]:
# Row counts per split, in (train, val, test) order.
tuple(len(df[df['part'] == split]) for split in ('train', 'val', 'test'))

In [None]:
# Save the first 10 rows as a small sample file; the row index is not written.
df.head(10).to_csv('../data/sample.csv', index=False)

In [None]:
# Print index, text and label for the first 11 rows (index 0 through 10);
# the index was reset to 0..n-1 after the last shuffle.
for index, row in df.iloc[:11].iterrows():
    print(index, row['text'], row['class'], '\n')

In [None]:
# Pick one specific article as the running example by matching its full text exactly.
# The matching index labels are passed to .iloc as positions, which works because
# the index was reset to 0..n-1 after the last shuffle.
# NOTE(review): column position 0 is assumed to be 'text' — confirm against the CSV column order.
# Raises IndexError if no row has exactly this text.
example_text = df.iloc[df[df['text'] == 'Boos and chants of  Lock her up!  were heard in the crowd assembled at the West Front of the U.S. Capitol Friday morning when defeated Democratic Party presidential nominee Hillary Clinton was introduced at the inaugural ceremony for President-elect Donald Trump.#InaugurationDay Lock her up pic.twitter.com/APVtyyYote  Bill Simms (@Mittens1245) January 20, 2017The crowd on the mall booed when the jumbotron showed a close-up shot of Hillary Clinton at #Inauguration https://t.co/1dvY5lxdKo  gpbnews (@gpbnews) January 20, 2017Some in crowd chanting LOCK HER UP as Hillary Clinton arrives  Jamie Dupree (@jamiedupree) January 20, 2017Via: Gateway Pundit '].index, 0].values[0]
print(example_text)

We need to take care of user handles (e.g. @jamiedupree should be treated as a single, separate token) and hashtags (#Inauguration is one token here). We also want to keep URLs as a single token (e.g. pic.twitter.com/APVtyyYote or https://t.co/1dvY5lxdKo).

In [None]:
# Zero-width sentence boundary: the position right after '.', '?' or '!' that is
# followed by whitespace or '#'. Dots inside URLs never qualify because they are
# followed by a letter, not by whitespace.
_SENTENCE_BOUNDARY = re.compile(
    r'(?<!\w\.\w\.)'       # not after dotted abbreviations such as "U.S." or "e.g."
    r'(?<![A-Z][a-z]\.)'   # not after a capitalised two-letter abbreviation such as "Mr."
    r'(?<=\.|\?|!)'        # directly after sentence-ending punctuation
    r'(?=\s|[#])'          # and directly before whitespace or a hashtag
)


def split_into_sentences(text):
    """Split ``text`` into a list of sentences.

    The text is cut at every sentence boundary described by
    ``_SENTENCE_BOUNDARY``; each piece is stripped of surrounding whitespace
    and empty pieces are dropped.

    The last character of the first lookbehind is an escaped dot. The previous
    unescaped ``.`` matched any character, so a boundary after text such as
    "1.2!" was wrongly suppressed.

    Args:
        text: The raw text to split.

    Returns:
        A list of non-empty, stripped sentence strings (empty for blank text).
    """
    stripped_parts = (part.strip() for part in _SENTENCE_BOUNDARY.split(text))
    return [part for part in stripped_parts if part]

# Run the splitter on the example article and print one sentence per line.
sentences = split_into_sentences(example_text)
for sentence in sentences:
    print(sentence)

In [None]:
# Token alternatives, tried in this order at every position.
_TOKEN_PATTERN = re.compile(
    r"pic\.twitter\.com/\S+"     # scheme-less Twitter picture links (dots are literal)
    r"|https?://\S+"             # http/https URLs
    r"|www\.\S+"                 # URLs starting with www.
    r"|#\S+"                     # hashtags
    r"|@\w+"                     # user handles
    r"|\w+(?:['-]\w+)*'?"        # words, keeping inner apostrophes and hyphens together
    r"|[\w'-]+"                  # leftover runs that start with an apostrophe or hyphen
    r"|[.,!?;]"                  # standalone punctuation
)


def split_into_words(sentences):
    """Tokenize each sentence into URLs, hashtags, handles, words and punctuation.

    Compared with a plain word split, URLs, hashtags and handles are kept as
    single tokens, and hyphenated or apostrophised words such as
    "President-elect" or "don't" stay whole. Characters not covered by any
    alternative, such as parentheses, are skipped.

    Args:
        sentences: An iterable of sentence strings.

    Returns:
        A list with one list of token strings per input sentence.
    """
    return [_TOKEN_PATTERN.findall(sentence) for sentence in sentences]

# Tokenize the example sentences and print one token list per sentence.
tokenized = split_into_words(sentences)
for tokens in tokenized:
    print(tokens)

In [None]:
def stem_words(tokenized_sentences, language="english"):
    """Stem every token of every sentence with a Snowball stemmer.

    Args:
        tokenized_sentences: An iterable of token lists, one per sentence.
        language: Language name passed to ``SnowballStemmer``.

    Returns:
        A list of lists of stems with the same nesting as the input.
    """
    stem = SnowballStemmer(language).stem
    return [[stem(word) for word in sentence] for sentence in tokenized_sentences]

# Stem the example tokens and print one stem list per sentence.
stemmed = stem_words(tokenized)
for s in stemmed:
    print(s)

In [None]:
def lemmatize_tokens(tokenized_sentences):
    """Lemmatize every token of every sentence with the WordNet lemmatizer.

    Each token is passed to ``lemmatize`` on its own, with no part-of-speech
    argument, so the lemmatizer's default applies.

    Args:
        tokenized_sentences: An iterable of token lists, one per sentence.

    Returns:
        A list of lists of lemmas with the same nesting as the input.
    """
    wordnet = WordNetLemmatizer()
    result = []
    for sentence_tokens in tokenized_sentences:
        result.append(list(map(wordnet.lemmatize, sentence_tokens)))
    return result

# Lemmatize the example tokens and print one lemma list per sentence.
lemmatized = lemmatize_tokens(tokenized)
for l in lemmatized:
    print(l)

In [None]:
def process_text(text):
    """Annotate ``text`` sentence by sentence with token, stem and lemma.

    The text is split into sentences and tokens; stems and lemmas are both
    computed from the original tokens.

    Args:
        text: The raw text to annotate.

    Returns:
        A list with one DataFrame per sentence. Each DataFrame is built from
        one record per token with the keys 'Token', 'Stem' and 'Lemma'.
    """
    token_rows = split_into_words(split_into_sentences(text))
    stem_rows = stem_words(token_rows)
    lemma_rows = lemmatize_tokens(token_rows)

    frames = []
    # The three nested lists are parallel, so they can be walked in lockstep.
    for tokens, stems, lemmas in zip(token_rows, stem_rows, lemma_rows):
        records = [
            {'Token': token, 'Stem': stem, 'Lemma': lemma}
            for token, stem, lemma in zip(tokens, stems, lemmas)
        ]
        frames.append(pd.DataFrame(records))
    return frames

In [None]:
# Quick look at the frame before it is exported.
df.head()

In [None]:
def write_dataset(df, part):
    """Write one annotated TSV file per row of ``df``.

    Files go to ``../assets/annotated-corpus/<part>/<true|fake>/<index>.tsv``.
    Each file holds one tab-separated line per token (token, stem, lemma),
    with no header, and a blank line after every sentence.

    Args:
        df: DataFrame with a 'text' column and a 'class' column; a truthy
            class selects the 'true' directory, otherwise 'fake'.
        part: Name of the split, used as a directory name.
    """
    part_dir = Path(f'../assets/annotated-corpus/{part}')
    for index, row in tqdm(df.iterrows(), total=len(df)):
        class_ = 'true' if row['class'] else 'fake'
        out_dir = part_dir / class_
        out_dir.mkdir(parents=True, exist_ok=True)
        path = out_dir / f'{index}.tsv'

        sentence_dfs = process_text(row['text'])
        # An explicit UTF-8 encoding keeps non-ASCII tokens writable whatever the
        # platform default is. newline='' is what pandas asks for when to_csv
        # receives an already-open file object.
        with open(path, 'w', encoding='utf-8', newline='') as f:
            for sentence_df in sentence_dfs:
                sentence_df.to_csv(f, index=None, sep='\t', header=None)
                f.write('\n')

In [None]:
%%time
# Export each split separately; rows are selected by their 'part' label.
for part in ['train', 'val', 'test']:
    write_dataset(df[df['part'] == part], part)