In [None]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# re-imported before each cell runs and edits to them are picked up live.
%load_ext autoreload
%autoreload 2

In [None]:
import dask.dataframe as dd
import pandas as pd
from spacy.lang.en import English
import seaborn as sns

In [None]:
# Display configuration: render up to 100 rows and up to 2000 characters per
# cell so long text columns are not truncated when frames are displayed.
# The fully qualified "display.*" keys are required: pandas treats option
# names as patterns, and a bare "max_rows" also matches
# "styler.render.max_rows" on pandas >= 1.4, which raises OptionError.
pd.set_option("display.max_rows", 100)
pd.set_option("display.max_colwidth", 2000)

In [None]:
# Load the training and test splits of the Corona NLP tweet data as plain
# pandas frames.
# NOTE(review): an earlier, disabled experiment wrapped `df` in a dask frame
# (dd.from_pandas(...) followed by .persist()); the cells below use pandas only.
df = pd.read_csv(filepath_or_buffer="./data/Corona_NLP_train.csv")
df_test = pd.read_csv(filepath_or_buffer="./data/Corona_NLP_test.csv")

# Data Exploration
Let's begin with examining a few datapoints and attributes.

In [None]:
# Preview the first five rows of the training frame (head() defaults to n=5).
df.head()

In [None]:
# Distinct values present in the Sentiment column.
set(df["Sentiment"])

There are some immediate observations about the tweets: 

There are hashtags, which are concatenations of words and might therefore not be easy to tokenize. Hashtags could, however, be extracted as a separate feature.

Some tweets contain URLs, which might not be easily processed within standard NLP pipelines.

Some tweets contain tab, newline ('\n'), or carriage-return ('\r') characters.

The Location attribute does not seem to correspond to a physical place in a lot of cases. If this attribute were to be used, it would need complex data cleaning.

In [None]:
# Distinct values present in the Location column (free-text field).
set(df["Location"])

Furthermore, the location is missing in approximately 20% of the datapoints.

In [None]:
# Number of missing values in each column (isnull is an alias of isna).
df.isnull().sum()

In [None]:
# Class balance: number of rows per sentiment label, with the bars ordered
# from most negative to most positive.
sns.countplot(
    x="Sentiment",
    data=df,
    order=[
        "Extremely Negative",
        "Negative",
        "Neutral",
        "Positive",
        "Extremely Positive",
    ],
)

# Data Cleaning
Next let's identify potentially problematic patterns using regex.

In [None]:
# Work on an independent (deep) copy so the raw training frame `df` stays
# untouched by the cleaning steps that follow.
clean_df = df.copy(deep=True)

Clean up whitespace.

In [None]:
# Normalise whitespace: every run of whitespace characters (spaces, tabs,
# '\n', '\r', ...) becomes a single space. Matching `\s+` instead of only
# '[\n\r]' followed by ' +' also covers tabs, so the later URL and mention
# patterns, which stop at a literal space, cannot run across a tab into the
# next word.
clean_df['OriginalTweet'] = clean_df['OriginalTweet'].str.replace(r'\s+', ' ', regex=True)

Remove repeated question marks.

In [None]:
# Collapse a run of two or more question marks, optionally separated by single
# spaces ("???", "? ? ?", "?? ?"), into one "?".
# The pattern only consumes a space when another "?" follows it, so the space
# after the final question mark is kept ("why?? yes" -> "why? yes") and a lone
# "?" is left untouched. The raw string avoids the invalid "\?" string escape.
clean_df['OriginalTweet'] = clean_df['OriginalTweet'].str.replace(r'\?(?: ?\?)+', '?', regex=True)

Remove URLs.

In [None]:
# Replace every token that starts with "http" (up to the next space) with a
# single space.
clean_df["OriginalTweet"] = clean_df["OriginalTweet"].str.replace(
    pat="http[^ ]*",
    repl=" ",
    regex=True,
)

Replace the ampersand sign ('&') and its XML entity ('&amp;') with the word "and".

In [None]:
# Rewrite both the "&amp;" entity and a bare "&" as the word "and".
# NOTE(review): no surrounding spaces are inserted, so "a&b" becomes "aandb".
clean_df["OriginalTweet"] = clean_df["OriginalTweet"].str.replace(
    pat="(&amp;)|&",
    repl="and",
    regex=True,
)

Change hashtags to normal words (the '#' is replaced by a space, so adjacent hashtags stay separate words).

'#yolo' -> ' yolo'

In [None]:
# Turn hashtags into ordinary words by replacing each "#" with a space.
clean_df["OriginalTweet"] = clean_df["OriginalTweet"].str.replace(
    pat="#",
    repl=" ",
    regex=True,
)

Remove references to Twitter users (mentions).

Example: '@bbc'

In [None]:
# Replace every "@" followed by at least one non-space character (a user
# mention such as "@bbc") with a single space.
clean_df["OriginalTweet"] = clean_df["OriginalTweet"].str.replace(
    pat="@[^ ]+",
    repl=" ",
    regex=True,
)

In [None]:
# Sanity check: list the tweets that, after cleaning, contain no run of three
# or more word characters, i.e. tweets with essentially no text left.
# The pattern is a raw string so `\w` reaches the regex engine without relying
# on Python's deprecated handling of unknown string escapes; na=False keeps the
# boolean mask invertible with `~` even if a tweet value is missing.
clean_df[~clean_df['OriginalTweet'].str.contains(r'\w{3,}', na=False)]

In [None]:
# Spot-check ten cleaned tweets. The fixed random_state makes the displayed
# sample identical on every Restart & Run All.
clean_df.sample(10, random_state=42)