# Data Analysis
Analyzing the tweet datasets: gathering different kinds of information and answering our own questions about the data.
For this, we look into the different columns of the data. While analyzing, we also compare the train and trial datasets: do they seem to come from the same data distribution?

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from datetime import datetime
sns.set_theme()
%matplotlib inline

# Load the competition splits; for test.csv only the columns at positions 1-4 are read.
DATA_DIR = '../datasets/competition'
train = pd.read_csv(f'{DATA_DIR}/train.csv')
trial = pd.read_csv(f'{DATA_DIR}/trial.csv')
test = pd.read_csv(f'{DATA_DIR}/test.csv', usecols=[1,2,3,4])

# General info about the datasets: row count and deep memory usage in KB
for split_name, frame in (('Train', train), ('Trial', trial), ('Test', test)):
    size_kb = np.round(frame.memory_usage(deep=True).sum()/1000)
    print(f'{split_name} - number of rows: {len(frame)}, memory size {size_kb} KB')
print(f'Columns: \n{train.dtypes}')
train.head()

### Date
In what time range were the posts written? How many were written per day and per month, and what does the time distribution look like for each time period?

In [None]:
# Parse the raw date strings in place; the per-label cell below calls .date() on these values.
DATE_FORMAT = '%Y-%m-%d %H:%M:%S+00:00'
splits = {'train': train, 'trial': trial, 'test': test}
for frame in splits.values():
    frame['date'] = pd.to_datetime(frame['date'], format=DATE_FORMAT)

# Time range covered by each split
for split_name, frame in splits.items():
    first_day = frame['date'].min().date().strftime("%d.%b %Y")
    last_day = frame['date'].max().date().strftime("%d.%b %Y")
    print(f'Time range of {split_name}: {first_day} to {last_day}')


def _count_per_period(dates, to_period):
    """Count tweets per time period.

    dates: Series of timestamps.
    to_period: callable mapping one timestamp to the period it belongs to.
    Returns (periods, counts) as produced by np.unique, i.e. sorted by period.
    """
    periods = [to_period(ts) for ts in dates.tolist()]
    return np.unique(periods, return_counts=True)


def _top_periods(periods, counts, n, total, label_format):
    """Return the formatted labels of the n busiest periods and their rounded share (%) of `total`."""
    n = min(n, len(counts))  # fewer distinct periods than requested: report all of them
    if n == 0:
        return [], 0.0
    busiest = np.argpartition(counts, -n)[-n:]
    labels = [period.strftime(label_format) for period in sorted(periods[busiest])]
    share = np.round(100 * (np.sum(counts[busiest]) / total))
    return labels, share


def _plot_counts(ax, title, counts_by_split):
    """Draw one line per split on `ax`: train solid, every other split dashed."""
    for split_name, (periods, counts) in counts_by_split.items():
        style = {} if split_name == 'train' else {'linestyle': 'dashed'}
        ax.plot(periods, counts, label=split_name, linewidth=1, **style)
    ax.set_title(title)
    ax.set_ylabel('Number of tweets')
    ax.legend()


fig, axes = plt.subplots(4, 1, figsize=(10, 10))

# (unit for the title, phrase for the summary, top-n, label format, timestamp -> period)
granularities = [
    ('year', 'three years', 3, '%Y', lambda ts: ts.date().replace(month=1, day=1)),
    ('month', 'four months', 4, '%b %Y', lambda ts: ts.date().replace(day=1)),
    ('day', 'five days', 5, '%d.%b %Y', lambda ts: ts.date()),
]
for ax, (unit, phrase, n_top, label_format, to_period) in zip(axes, granularities):
    counts_by_split = {
        split_name: _count_per_period(frame['date'], to_period)
        for split_name, frame in splits.items()
    }
    _plot_counts(ax, f'Number of tweets written each {unit}', counts_by_split)

    X_train, Y_train = counts_by_split['train']
    # Share is relative to the actual size of train rather than a hard-coded 500.
    labels, share = _top_periods(X_train, Y_train, n_top, len(train), label_format)
    print(f'The {phrase} with the most tweets are: {labels} having {share} % of all tweets')

# Tweets sharing the exact same timestamp
exact_counts = {
    split_name: _count_per_period(frame['date'], lambda ts: ts)
    for split_name, frame in splits.items()
}
X_train, Y_train = exact_counts['train']
X_trial, Y_trial = exact_counts['trial']

# np.unique returns sorted values, so the duplicated timestamps are already in order.
duplicate_times = X_train[Y_train > 1]
duplicate_labels = [moment.strftime("%H:%M:%S %d.%b %Y") for moment in duplicate_times]

print(f'Times, when atleast two tweets where written at the exact same moment: {duplicate_labels}')
print('Content of some tweets written at the exact same time:')
for moment in duplicate_times[:3]:
    # Compare against the timestamp itself instead of re-parsing its formatted string.
    same_time = train.loc[train['date'] == moment, 'content'].tolist()
    print([text[:70] + '..' for text in same_time])
print(f'Number of exact time tweets in train: {len(duplicate_times)} and in trial: {np.count_nonzero(Y_trial > 1)}')

_plot_counts(axes[3], 'Number of tweets written at the exact time point', exact_counts)
axes[3].set_xlabel('Time')

fig.tight_layout()
plt.show()

In [None]:
# Monthly tweet counts in train, computed separately for each label value
def _month_start(ts):
    """Map a timestamp to the first day of its month."""
    return ts.date().replace(day=1)

pos_dates = sorted(_month_start(ts) for ts in train.loc[train['label'] == 0, 'date'].tolist())
neg_dates = sorted(_month_start(ts) for ts in train.loc[train['label'] == 1, 'date'].tolist())
neu_dates = sorted(_month_start(ts) for ts in train.loc[train['label'] == 2, 'date'].tolist())

X_pos, Y_pos = np.unique(pos_dates, return_counts=True)
X_neu, Y_neu = np.unique(neu_dates, return_counts=True)
X_neg, Y_neg = np.unique(neg_dates, return_counts=True)

plt.title('Number of tweets written each month')
# The label-2 ('Other') curve is computed above but intentionally not drawn:
#plt.plot(X_neu,Y_neu, label='Other', linewidth=1)
for periods, counts, legend_name in ((X_pos, Y_pos, 'Solidary'), (X_neg, Y_neg, 'Anti-solidary')):
    plt.plot(periods, counts, label=legend_name, linewidth=1)
plt.ylabel('Number of tweets')
plt.legend()
plt.show()

#### Discussion
Even from the dates of the tweets alone, we can see that the tweets in the different datasets were likely written in the same time periods and follow the same time distribution. From this fact alone, it already seems likely that they come from the same data distribution.

As can be seen, most posts were written in 2021, and the yearly counts otherwise keep decreasing, except for 2018, which has nearly as many posts as 2021. However, the tweets were not collected evenly over the years: some months, four in particular, clearly contain the most tweets, together accounting for nearly 50% of them. Even within these months, the tweets are not evenly distributed. Dec 2018 and Nov 2021 have a somewhat more even distribution, while Feb 2020 and Aug 2021 each have one clear peak day.

What happened on these days? Looking into them, I did not find anything related to Islam solidarity; therefore, I consider these peaks to be an artifact of the data crawling process, which unintentionally favored these days. Lastly, we look at tweets that were written at the exact same time. We find only very few occurrences of this; most of the time the datetimes are unique. Although it is not unlikely on Twitter to find two tweets written at the exact same time (even in the same second!), it is very unlikely to find such tweets in a dataset on a very specific topic with only 500 samples available. That is why I assumed that these tweets are simply duplicates, and looking at the content confirms this. Using the time alone, we count 11 and 12 of these copied tweets in train and trial, respectively. Nevertheless, there are sometimes small changes in the content, e.g. the removal of an emoji in one tweet.

### Country
Next we investigate the country column of the tweets. Where do the tweets come from?

In [None]:
def _tweets_per_country(frame, other_columns, split_name):
    """Count tweets per country, most frequent first.

    frame: one dataset split.
    other_columns: columns left after grouping that are dropped, so only the
        count taken from the 'date' column remains.
    split_name: name given to the resulting count column.
    """
    counts = frame.groupby('country').count().sort_values(by='date', ascending=False)
    return counts.drop(columns=other_columns).rename(columns={'date': split_name})


train_country = _tweets_per_country(train, ['content', 'label', 'lang'], 'train')
trial_country = _tweets_per_country(trial, ['content', 'lang'], 'trial')
test_country = _tweets_per_country(test, ['content', 'lang'], 'test')

# Report the actual top-3 countries and their share of all train rows, instead of
# hard-coded country names and a divisor that is only correct for exactly 500 rows.
top3 = train_country['train'].iloc[:3]
top3_share = 100 * top3.sum() / len(train)
print(f'Top 3 countries {", ".join(map(str, top3.index))} make {top3_share:.1f}% of all tweets')
pd.concat([train_country, trial_country, test_country], axis=1)

### Lang

In [None]:
def _tweets_per_lang(frame, other_columns, split_name):
    """Count tweets per language (most frequent first) in a single column named `split_name`."""
    counts = frame.groupby('lang').count()
    counts = counts.sort_values(by='date', ascending=False)
    return counts.drop(columns=other_columns).rename(columns={'date': split_name})


train_lang = _tweets_per_lang(train, ['content', 'label', 'country'], 'train')
trial_lang = _tweets_per_lang(trial, ['content', 'country'], 'trial')
test_lang = _tweets_per_lang(test, ['content', 'country'], 'test')

# Show the raw text of the tweets whose language code is "in"
train_in = train.loc[train['lang'] == 'in', 'content'].tolist()
trial_in = trial.loc[trial['lang'] == 'in', 'content'].tolist()
print(f'Tweets with lang "in": {train_in}, {trial_in}')
pd.concat([train_lang, trial_lang, test_lang], axis=1)

### Content


In [None]:
# Character length of every tweet in each split
train_lens = train['content'].map(len).tolist()
longest_idx = int(np.argmax(train_lens))
print(f'Longest post in train with {train_lens[longest_idx]} symbols: {train["content"].iloc[longest_idx]}')
trial_lens = trial['content'].map(len).tolist()
test_lens = test['content'].map(len).tolist()

# Overlaid histograms (20 bins each); trial and test are semi-transparent so train stays visible
plt.title('Length of tweets (number of characters)')
plt.hist(train_lens, 20, label='train', linewidth=1)
plt.hist(trial_lens, 20, label='trial', linewidth=1, alpha=0.5)
# The test histogram was previously labelled 'trial', giving two identical legend entries.
plt.hist(test_lens, 20, label='test', linewidth=1, alpha=0.5)
plt.ylabel('Number of tweets')
plt.legend()
plt.show()

In [None]:
# Looking at the hateful posts (label 1): print each raw tweet followed by the
# tweet at the same position in the preprocessed file.
# NOTE(review): pairing is purely positional - assumes both files keep the same rows in the same order; confirm.
train_pre = pd.read_csv('../datasets/preprocessed/train.csv')
t = train.loc[train['label'] == 1, 'content'].tolist()
tp = train_pre.loc[train_pre['label'] == 1, 'content'].tolist()
for raw_text, pre_text in zip(t, tp):
    print(raw_text, pre_text, sep='\n')

In [None]:
# Looking at the solidary posts (label 0): print each raw tweet followed by the
# tweet at the same position in the preprocessed file.
# The preprocessed data is read from the same location as in the hateful-posts cell;
# this cell previously pointed at '../datasets/train_pre.csv', which is otherwise only
# referenced in commented-out code.
# NOTE(review): pairing is purely positional - assumes both files keep the same rows in the same order; confirm.
train_pre = pd.read_csv('../datasets/preprocessed/train.csv')
t = train.loc[train['label'] == 0, 'content'].tolist()
tp = train_pre.loc[train_pre['label'] == 0, 'content'].tolist()
for raw_text, pre_text in zip(t, tp):
    print(raw_text)
    print(pre_text)

In [None]:
# Content of all train tweets with label 2
train.loc[train['label'] == 2, 'content'].tolist()

### Discussion

- The longest post has 824 symbols! However, it consists mostly of tagged users, which is not useful information.

### Label

In [None]:
# Number of train tweets per label, most frequent first, with readable label names
LABEL_NAMES = {2: 'Other', 0: 'Solidarity', 1: 'Anti-Solidarity'}

label_counts = train.groupby('label').count().sort_values(by='date', ascending=False)
train_label = label_counts.drop(columns=['content', 'lang', 'country']).rename(columns={'date': 'train'})
# Function lookup (not a dict mapper) so an unexpected label value raises KeyError
train_label.index = train_label.index.map(lambda code: LABEL_NAMES[code])
train_label

## Other
Helpful scripts for debugging

In [None]:
# Check misclassification: export the dev rows whose predicted label differs from the true label
import pandas as pd
import numpy as np

df = pd.read_csv('../datasets/preprocessed/dev.csv')
pred = np.loadtxt('labels.txt')
df['pred'] = pred
# Cast both label columns to the same integer type before comparing them
df = df.astype({'pred': 'int32', 'label': 'int32'})
mismatches = df.loc[df['label'] != df['pred']]
mismatches.to_csv('test_label.csv')

In [None]:
# Checking old submissions: print how often each label value occurs in every saved prediction file
import numpy as np

submission_files = ('labels0.84.txt', 'labels0.848.txt', 'labels0.856.txt', 'labels.txt')
a, b, c, d = (np.loadtxt(f'../old_submissions/{file_name}') for file_name in submission_files)

for labels in (a, b, c, d):
    n0, n1, n2 = ((labels == value).sum() for value in (0, 1, 2))
    print(f'Labels 0: {n0},  1: {n1},  2: {n2}')