# Standard Data Preparation Techniques

## Disclaimer:
```javascript
This is a work in progress. I will be adding more techniques as I learn them. If you have any suggestions, please feel free to reach out to me.
```
## Credits:
```javascript
This work is inspired by the following previous work sources:
https://github.com/minwhoo/CrossAug

author: Minwoo Lee

citation: Minwoo Lee, Seungwon Do, and Sung Ju Hwang. 2020. CrossAug: Cross-lingual Data Augmentation for Low-resource Neural Machine Translation. In Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics (ACL 2020). Association for Computational Linguistics, Online, July 5-10, 2020, pages 1-11. https://www.aclweb.org/anthology/2020.acl-main.1
```




```
---

- [ ] Data Cleaning: Identifying and correcting mistakes or errors in the data.
- [ ] Feature Selection: Identifying those input variables that are most relevant to the task.
- [ ] Data Transforms: Changing the scale or distribution of variables.
- [ ] Dimensionality Reduction: Creating compact projections of the data.
- [ ] Feature Engineering: Deriving new variables from available data.

---


### Note:
```javascript
In Text Processing, we Tokenizer,  meaning convert text into a sequence of tokens, create a numerical representation of the tokens, and assemble them into tensors. Other types of Data transformation is needed for different tasks lie Feature extractor (Audio), ImageProcessor (Images), or Processor (Multimodal).
```




In [None]:

!mkdir fever_data 
!cd fever_data & curl -L https://raw.githubusercontent.com/minwhoo/CrossAug/master/download_data.sh |sh
!curl -O https://raw.githubusercontent.com/minwhoo/CrossAug/master/utils_fever.py
!curl -O https://raw.githubusercontent.com/minwhoo/CrossAug/master/run_fever.py
!curl -O https://raw.githubusercontent.com/minwhoo/CrossAug/master/modeling_bert.py
!curl -O https://raw.githubusercontent.com/minwhoo/CrossAug/master/run_fever.py
!pip install jsonlines==2.0.0  nltk==3.6.2 numpy==1.20.2 pandas==1.1.5  scikit-learn==0.24.2  scipy==1.6.3  sentencepiece==0.1.95 tensorboardX==2.2  torch==1.8.1  transformers==4.11.2  pytorch-transformers==1.2.0  tqdm==4.60.0
!pip install pytorch-transformers

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

model_name = 'minwhoo/bart-base-negative-claim-generation'
tokenizer = AutoTokenizer.from_pretrained(model_name)

model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
model.to('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
examples = [
    "Little Miss Sunshine was filmed over 30 days.",
    "Magic Johnson did not play for the Lakers.",
    "Claire Danes is wedded to an actor from England."
]

batch = tokenizer(examples, max_length=1024, padding=True, truncation=True, return_tensors="pt")
out = model.generate(batch['input_ids'].to(model.device), num_beams=5)
negative_examples = tokenizer.batch_decode(out, skip_special_tokens=True)
print(negative_examples)



['Little Miss Sunshine was filmed less than 3 days.', 'Magic Johnson played for the Lakers.', 'Claire Danes is married to an actor from France.']


In [None]:
import time
import argparse

import torch
import jsonlines
from tqdm import trange, tqdm
from nltk import word_tokenize
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM


def find_substitution_map(sent1, sent2):
    """Find overlapping words in the given two sentences"""
    words1 = word_tokenize(sent1)
    words2 = word_tokenize(sent2)
    start_idx = 0
    while words1[start_idx] == words2[start_idx]:
        start_idx += 1
        if start_idx == len(words1) or start_idx == len(words2):
            return None

    end_idx = -1
    while words1[end_idx] == words2[end_idx]:
        end_idx -= 1

    if end_idx == -1:
        words_overlap1 = words1[start_idx:]
        words_overlap2 = words2[start_idx:]
    else:
        words_overlap1 = words1[start_idx:end_idx+1]
        words_overlap2 = words2[start_idx:end_idx+1]

    if 0 < len(words_overlap1) <= 3 and 0 < len(words_overlap2) <= 3:
        return words_overlap1, words_overlap2
    else:
        return None


def substitute_sent(sent, orig_words, replacing_words):
    """Find and substitute word phrases from given sentence"""
    sent_words = word_tokenize(sent)
    j = 0
    match_start_idx = None
    match_end_idx = None
    matches = []
    for i in range(len(sent_words)):
        if sent_words[i] == orig_words[j]:
            if j == 0:
                match_start_idx = i
            j += 1
        else:
            j = 0
            match_start_idx = None
            match_end_idx = None
        if j == len(orig_words):
            match_end_idx = i
            matches.append((match_start_idx, match_end_idx))
            j = 0
            match_start_idx = None
            match_end_idx = None
    if len(matches) == 1:
        i, j = matches[0]
        return ' '.join(sent_words[:i] + replacing_words + sent_words[j+1:])
    else:
        return None


def generate_negative_claims(data, batch_size):
    """Generate negative (refuted) claims using fine-tuned negative claim generation model"""
    model_name = 'minwhoo/bart-base-negative-claim-generation'
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    model.to('cuda' if torch.cuda.is_available() else 'cpu')

    for i in trange(0, len(data), batch_size):
        sents = [d['claim'] for d in data[i:i+batch_size]]
        batch = tokenizer(sents, padding=True, truncation=True, return_tensors="pt")
        out = model.generate(batch['input_ids'].to(model.device), num_beams=5)
        refuted_sents = tokenizer.batch_decode(out, skip_special_tokens=True)
        for j, refuted in enumerate(refuted_sents):
            data[i + j]['claim_refuted'] = refuted
    return data


def augment_start(in_file="file.txt",out_file="file.txt",batch_size=64 ): 

    print(f"Reading from path: {in_file}")
    with jsonlines.open(in_file, mode='r') as reader:
        data = [obj for obj in reader]
    print(f"Data loaded! Data size: {len(data):,}")

    print('Generate negative claims')
    start_time = time.time()
    data = generate_negative_claims(data, batch_size)
    print(f"time took: {time.time() - start_time}")

    print('Modify evidence using lexical search-based substitution')
    failed_cnt = 0
    start_time = time.time()
    for d in tqdm(data):
        try:
            span_pair = find_substitution_map(d['claim'], d['claim_refuted'])
        except:
            failed_cnt += 1
        else:
            if span_pair is not None:
                orig_span, replace_span  = span_pair
                evid_refuted = substitute_sent(d['evidence'], orig_span, replace_span)
                if evid_refuted is not None:
                    d['evidence_refuted'] = evid_refuted
    print(f"time took: {time.time() - start_time}")

    print('Augment data')
    augmented_data = []
    for d in data:
        augmented_data.append({
            'gold_label': d['gold_label'],
            'evidence': d['evidence'],
            'claim': d['claim'],
            'id': len(augmented_data),
            'weight': 0.0,
        })
        if d['gold_label'] == 'SUPPORTS':
            augmented_data.append({
                    'gold_label': 'REFUTES',
                    'evidence': d['evidence'],
                    'claim': d['claim_refuted'],
                    'id': len(augmented_data),
                    'weight': 0.0,
                })
            if 'evidence_refuted' in d:
                augmented_data.append({
                        'gold_label': 'REFUTES',
                        'evidence': d['evidence_refuted'],
                        'claim': d['claim'],
                        'id': len(augmented_data),
                        'weight': 0.0,
                    })
                augmented_data.append({
                        'gold_label': 'SUPPORTS',
                        'evidence': d['evidence_refuted'],
                        'claim': d['claim_refuted'],
                        'id': len(augmented_data),
                        'weight': 0.0,
                    })

    print(f"Saving to path: {out_file}")
    with jsonlines.open(out_file, mode='w') as writer:
        writer.write_all(augmented_data)
    print(f"Data saved! Data size: {len(augmented_data):,}")

augment_start("fever_data/fever.train.jsonl","fever_data/fever+crossaug.train.jsonl",64)

In [175]:
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Finetuning the library models for sequence classification on GLUE (Bert, XLM, XLNet, RoBERTa)."""

from __future__ import absolute_import, division, print_function

import argparse
import glob
import logging
import os
import random

import numpy as np
import torch
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
                              TensorDataset)
from torch.utils.data.distributed import DistributedSampler
from tensorboardX import SummaryWriter
from tqdm import tqdm, trange

from pytorch_transformers import (WEIGHTS_NAME, BertTokenizer,
                                  RobertaConfig,
                                  RobertaForSequenceClassification,
                                  RobertaTokenizer,
                                  XLMConfig, XLMForSequenceClassification,
                                  XLMTokenizer, XLNetConfig,
                                  XLNetForSequenceClassification,
                                  XLNetTokenizer)
from modeling_bert import BertConfig, BertForSequenceClassification

from pytorch_transformers import AdamW, WarmupLinearSchedule

In [176]:
!python3.9 run_fever.py \
    --task_name fever \
    --do_train \
    --train_task_name fever+crossaug \
    --do_eval \
    --eval_task_names fever symmetric adversarial fm2 \
    --data_dir ./fever_data/ \
    --do_lower_case \
    --model_type bert \
    --model_name_or_path bert-base-uncased \
    --max_seq_length 128 \
    --per_gpu_train_batch_size 32 \
    --learning_rate 2e-5 \
    --num_train_epochs 3.0 \
    --save_steps 100000 \
    --output_dir ./crossaug_trained_models_seed=177697310/ \
    --output_preds \
    --seed 177697310
 

04/22/2023 03:02:56 - INFO - pytorch_transformers.modeling_utils -   loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-config.json from cache at /root/.cache/torch/pytorch_transformers/4dad0251492946e18ac39290fcfe91b89d370fee250efe9521476438fe8ca185.7156163d5fdc189c3016baca0775ffce230789d7fa2a42ef516483e4ca884517
04/22/2023 03:02:56 - INFO - pytorch_transformers.modeling_utils -   Model config {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "finetuning_task": "fever",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "num_labels": 3,
  "output_attentions": false,
  "output_hidden_states": false,
  "pad_token_id": 0,
  "pruned_heads": {},
  "torchscript": false,
  "type_vocab

In [185]:
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" BERT classification fine-tuning: utilities to work with GLUE tasks """
from sklearn.metrics import matthews_corrcoef, f1_score

logger = logging.getLogger(__name__)
import torch
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
                              TensorDataset)
from torch.utils.data.distributed import DistributedSampler
from tensorboardX import SummaryWriter
from tqdm import tqdm, trange

from pytorch_transformers import (WEIGHTS_NAME, BertTokenizer,
                                  RobertaConfig,
                                  RobertaForSequenceClassification,
                                  RobertaTokenizer,
                                  XLMConfig, XLMForSequenceClassification,
                                  XLMTokenizer, XLNetConfig,
                                  XLNetForSequenceClassification,
                                  XLNetTokenizer)
from modeling_bert import BertConfig, BertForSequenceClassification

from pytorch_transformers import AdamW, WarmupLinearSchedule

from utils_fever import (compute_metrics, convert_examples_to_features,
                        output_modes, processors)

logger = logging.getLogger(__name__)

ALL_MODELS = sum((tuple(conf.pretrained_config_archive_map.keys()) for conf in (BertConfig, XLNetConfig, XLMConfig, RobertaConfig)), ())

MODEL_CLASSES = {
    'bert': (BertConfig, BertForSequenceClassification, BertTokenizer),
    'xlnet': (XLNetConfig, XLNetForSequenceClassification, XLNetTokenizer),
    'xlm': (XLMConfig, XLMForSequenceClassification, XLMTokenizer),
    'roberta': (RobertaConfig, RobertaForSequenceClassification, RobertaTokenizer),
}

import argparse
import glob
import logging
import os
import random

import numpy as np
import torch
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
                              TensorDataset)
from torch.utils.data.distributed import DistributedSampler
from tensorboardX import SummaryWriter
from tqdm import tqdm, trange

from pytorch_transformers import (WEIGHTS_NAME, BertTokenizer,
                                  RobertaConfig,
                                  RobertaForSequenceClassification,
                                  RobertaTokenizer,
                                  XLMConfig, XLMForSequenceClassification,
                                  XLMTokenizer, XLNetConfig,
                                  XLNetForSequenceClassification,
                                  XLNetTokenizer)
from modeling_bert import BertConfig, BertForSequenceClassification


ImportError: ignored

In [186]:
argparse.ArgumentParser()

ArgumentParser(prog='ipykernel_launcher.py', usage=None, description=None, formatter_class=<class 'argparse.HelpFormatter'>, conflict_handler='error', add_help=True)

In [None]:
!python run_fever.py \
    --task_name fever \
    --do_train \
    --train_task_name fever \
    --do_eval \
    --eval_task_names fever symmetric adversarial fm2 \
    --data_dir ./fever_data/ \
    --do_lower_case \
    --model_type bert \
    --model_name_or_path bert-base-uncased \
    --max_seq_length 128 \
    --per_gpu_train_batch_size 32 \
    --learning_rate 2e-5 \
    --num_train_epochs 3.0 \
    --save_steps 100000 \
    --output_dir ./baseline_trained_models_seed=177697310/ \
    --output_preds \
    --seed 177697310

04/22/2023 03:06:23 - INFO - pytorch_transformers.modeling_utils -   loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-config.json from cache at /root/.cache/torch/pytorch_transformers/4dad0251492946e18ac39290fcfe91b89d370fee250efe9521476438fe8ca185.7156163d5fdc189c3016baca0775ffce230789d7fa2a42ef516483e4ca884517
04/22/2023 03:06:23 - INFO - pytorch_transformers.modeling_utils -   Model config {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "finetuning_task": "fever",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "num_labels": 3,
  "output_attentions": false,
  "output_hidden_states": false,
  "pad_token_id": 0,
  "pruned_heads": {},
  "torchscript": false,
  "type_vocab

In [None]:
!pip install autocuda gradio 

!pip install pyabsa[dev] -U


In [None]:
!pip install --upgrade huggingface-hub -U


In [None]:
from ast import Str
import gradio as gr
from tweetnlp import Sentiment, NER
from typing import Tuple, Dict
from statistics import mean

def clean_tweet(tweet: str, remove_chars: str = "@#") -> str:
    """Remove any unwanted characters
    Args:
        tweet (str): The raw tweet
        remove_chars (str, optional): The characters to remove. Defaults to "@#".
    Returns:
        str: The tweet with these characters removed
    """
    for char in remove_chars:
        tweet = tweet.replace(char, "")
    return tweet


def format_sentiment(model_output: Dict) -> Dict:
    """Format the output of the sentiment model
    Args:
        model_output (Dict): The model output
    Returns:
        Dict: The format for gradio
    """
    formatted_output = dict()
    print(model_output)

    try:
      if model_output["label"] == "positive":
          formatted_output["positive"] = model_output["probability"]
          formatted_output["negative"] = 1 - model_output["probability"]
      else:
          formatted_output["negative"] = model_output["probability"]
          formatted_output["positive"] = 1 - model_output["probability"]
      return formatted_output
    except:
      pass


def format_entities(model_output: Dict) -> Dict:
    """Format the output of the NER model
    Args:
        model_output (Dict): The model output
    Returns:
        Dict: The format for gradio
    """
    formatted_output = dict()
    for entity in model_output["entity_prediction"]:
        new_output = dict()
        name = " ".join(entity["entity"])
        entity_type = entity["type"]
        new_key = f"{name}:{entity_type}"
        new_value = mean(entity["probability"])
        formatted_output[new_key] = new_value
    return formatted_output


def classify(tweet: str) -> Tuple[Dict, Dict]:
    """Runs models
    Args:
        tweet (str): The raw tweet
    Returns:
        Tuple[Dict, Dict]: The formatted_sentiment and formatted_entities of the tweet
    """
    tweet = clean_tweet(tweet)
    # Get sentiment
    model_sentiment = se_model.sentiment(tweet)
    model_pred = se_model.predict(tweet)
    print(model_sentiment)
    print(model_pred)
    formatted_sentiment = format_sentiment(model_sentiment)
    # Get entities
    entities = ner_model.ner(tweet)
    formatted_entities = format_entities(entities)
    return formatted_sentiment, formatted_entities

    # https://github.com/cardiffnlp/tweetnlp
    

def run(tweets=None):
  se_model = Sentiment()
  ner_model = NER()

examples = list()
examples.append("Dameon Pierce is clearly the #Texans starter and he once again looks good")
examples.append("Deebo Samuel had 150+ receiving yards in 4 games last year - the most by any receiver in the league.")

for tweet in examples:
  classify(tweet)
    # Get a few examples from: https://twitter.com/NFLFantasy





    

Dameon Pierce is clearly the Texans starter and he once again looks good
{'label': 'positive'}
Dameon Pierce is clearly the Texans starter and he once again looks good


TypeError: ignored

In [None]:
import pandas as pd


def get_sentiment():
  result = pd.DataFrame(
        {
            "aspect": result["aspect"],
            "sentiment": result["sentiment"],
            # 'probability': result[0]['probs'],
            "confidence": [round(x, 4) for x in result["confidence"]],
            "position": result["position"],
        })

  

In [None]:
pred= {'label': 'neutral', 'score': 0.7174134254455566}
pred['label'].upper()

'NEUTRAL'

In [None]:
# Murtadha Marzouq's Sentiment Analysis Project -Replication of https://doi.org/10.1016/j.ijhm.2016.02.001

import nltk
from collections import Counter
from wordcloud import WordCloud, STOPWORDS
import matplotlib
import matplotlib.pyplot as plt

from textblob import TextBlob


#DATA CLEANING

In [None]:
# Simple Text Cleaning 
# In the punctuation value to understand what is going on you will have to read up on regex commands. This is getting rid of
# punctuation, special charatcters and emojis contained in the tweets and in the text = re.sub is doing a similar task but
# taking the extra step to clean the data.
# Cleaning the tweets Step 2
def clean_tweets_tb(input):
    punctuation = '!"$%&\'()*+,-./:;<=>?[\\]^_`{|}~•@'
    text = str(input)
    text = re.sub(r'https?://\S+', '', text)
    text = re.sub("@[A-Za-z0-9]+", "", text)
    text = re.sub(r"@[A-Za-z0-9]+", "", text)
    text = re.sub(r"(?:\@|http?\://|https?\://|www)\S+", "", text)
    text = re.sub(r"_[A-Za-z0-9]+", "", text)
    text = re.sub(r"__", "", text)
    text = re.sub(' +', ' ', text)
    text = re.sub('[' + punctuation + ']+', ' ', text)  # strip punctuation
    text = re.sub('\s+', ' ', text)  # remove double spacing
    text = re.sub('’', ' ', text)  # remove apostrophe'
    text = re.sub('\'', ' ', text)  # remove double spacing
    text = re.sub('([0-9]+)', '', text)  # remove numbers
    text = "".join([char for char in text if char not in string.punctuation])
    text = text.lower()  # Lower text
    return text

# WORD COUNT

In [None]:
!nltk.download('stopwords')
top_N = 10

stopwords = nltk.corpus.stopwords.words('english')

RE_stopwords = r'\b(?:{})\b'.format('|'.join(stopwords))
# replace '|'-->' ' and drop all stopwords
words = (data_clean.tweet
           .str.lower()
           .replace([r'\|', RE_stopwords], [' ', ''], regex=True)
           .str.cat(sep=' ')
           .split()
)
# generate DF out of Counter
rslt = pd.DataFrame(Counter(words).most_common(top_N))
print(rslt)
# plot


wc = WordCloud(background_color="white", colormap="Dark2",
               max_font_size=150, random_state=42).generate(str(rslt))
fig = plt.figure(
    figsize = (40, 30),
    facecolor = 'k',
    edgecolor = 'k')
plt.subplot(3, 4, index+1)
plt.imshow(wc, interpolation="bilinear")
plt.axis("off")
    
plt.show()

/bin/bash: -c: line 0: syntax error near unexpected token `'stopwords''
/bin/bash: -c: line 0: `nltk.download('stopwords')'


LookupError: ignored

# SENTIMENT ANALYSIS

In [None]:
data_sentiment = data_clean.copy()
pd.set_option('display.max_rows', 500)
pol = lambda x: TextBlob(x).sentiment.polarity
sub = lambda x: TextBlob(x).sentiment.subjectivity

data_sentiment['polarity'] = data_sentiment['tweet'].apply(pol)
data_sentiment['subjectivity'] = data_sentiment['tweet'].apply(sub)
data_sentiment

# Summary

In [None]:
print('Average polarity for quarantine related tweets = ',data_sentiment['polarity'].mean())
print('Average subjectivity for quarantine related tweets = ',data_sentiment['subjectivity'].mean())
print(data_sentiment.isin([0]).sum()) 
