# Basic EDA based on questions from the Agenda

## Load raw data

In [1]:
import pandas as pd
import os

# Collect every '.json' file in the raw data directory. os.listdir returns
# names in arbitrary order, so sort them to make the row order of `df`
# (and therefore which duplicate survives later de-duplication) reproducible.
files = sorted('../data/raw/' + x for x in os.listdir('../data/raw') if x.endswith('.json'))

# Parse each JSON-lines file once and concatenate in a single call.
# DataFrame.append was removed in pandas 2.0 and, when used in a loop,
# re-copied the accumulated frame on every iteration.
df = pd.concat([pd.read_json(file, lines=True) for file in files], ignore_index=True)

## How many tweets are in the dataset?

In [2]:
# Count each tweet id once: keep the first row seen for every id, then
# report how many rows remain.
df = df.drop_duplicates(subset=['id'], keep='first')
df.shape[0]

4994

## How many unique tweets (distinct text) are in the dataset?

In [3]:
# Collapse rows that share exactly the same text down to their first
# occurrence, then report the number of rows left.
unique_df = df.drop_duplicates(subset=['text'], keep='first')
unique_df.shape[0]

1478

## How many unique users?

In [4]:
def _distinct_user_ids(users):
    """Return the distinct ``user['id']`` values of ``users`` in first-seen order.

    ``users`` is any iterable of mappings that have an ``'id'`` key.
    ``dict.fromkeys`` de-duplicates through hash lookups, so the work is
    linear in the number of rows instead of rescanning a list for every row.
    """
    return list(dict.fromkeys(user['id'] for user in users))


# Distinct users over every row of `df`.
user_ids = _distinct_user_ids(df['user'])
print('Including retweets: %i' % len(user_ids))

# Distinct users over the text-deduplicated rows in `unique_df`.
unique_user_ids = _distinct_user_ids(unique_df['user'])
print('Excluding retweets: %i' % len(unique_user_ids))

Including retweets: 4016
Excluding retweets: 1091


## Number of retweets
Each tweet's retweet count is already included in the dataframe as the `retweet_count` column.

## Locate the top N retweeted and liked tweets and the users that posted them

In [13]:
N = 20

# Take the N rows ranked highest by retweet_count, then favorite_count,
# keep only the columns worth displaying, and replace each user record
# with its 'name' entry.
n_largest = (
    df.nlargest(N, ['retweet_count', 'favorite_count'])
    [['text', 'user', 'retweet_count', 'favorite_count']]
    .assign(user=lambda frame: frame['user'].apply(lambda account: account['name']))
)

n_largest

Unnamed: 0,text,user,retweet_count,favorite_count
2072,Happy that we're supporting 10K families from ...,Manu Kumar Jain,689,2378
7668,Since there's no upcoming election in Maharash...,Maharashtra Congress,223,628
7037,Which will be fact-checked First ??\n\nTMC MP ...,Rishi Bagree 🇮🇳,186,438
2997,"Dada is working silently, standing by the affe...",Initnamees 🇮🇳,159,875
2820,Bengal is currently battling with the countles...,Krishna Chandra Santra,159,178
8528,And here's the secret: UP is doing fewer than ...,Amit Schandillia,65,201
8154,"Amidst rising Covid-19 cases, migrant workers'...",MLA Sandhya Rani Tudu,58,77
2198,"On the afternoon of 20 May, Cyclone Amphan mad...",The Caravan,54,54
7149,19. NDRF/ODRAF/Fire service returned from Amph...,"I & PR Department, Odisha",47,375
8448,...streamlining of all relevant procedure espe...,HOME DEPARTMENT - GOVT. OF WEST BENGAL,42,114


## What are the locations?
## Number of tweets by location

In [37]:
# Keep only the tweets that carry a place record and reduce each record to
# its 'full_name' entry. dropna() discards both None and NaN; filtering with
# isin([None]) is not guaranteed to catch NaN, and a NaN that slipped through
# would fail on the subscript in the lambda.
places = (
    df['place']
    .dropna()
    .reset_index(drop=True)
    .apply(lambda x: x['full_name'])
)

In [41]:
# Number of tweets per place name; value_counts() lists the most frequent first.
places.value_counts()

Kolkata, India                       10
Ichalkaranji (Hatkalangda), India     8
Anekal Bangaluru, India               7
Mumbai, India                         3
Sundarban Tigar Reserve, India        3
Chhapra, India                        2
West Bengal, India                    2
Gaya, India                           1
Panvel, India                         1
Nagaon, India                         1
Haora, India                          1
Bhubaneshwar, India                   1
Himachal Pradesh, India               1
Madhya Pradesh, India                 1
Warisaliganj, India                   1
Xiaomi India                          1
Bidhan Nagar, India                   1
Bengaluru South, India                1
Gujarat, India                        1
Surat, India                          1
Maraghat, India                       1
New Delhi, India                      1
Noida, India                          1
Name: place, dtype: int64

## What are the main languages?
## What are the counts and proportions of tweets by languages?

In [51]:
# One row per distinct value of the 'lang' column with its tweet count,
# most frequent first.
langs = df['lang'].value_counts().rename_axis('unique_values').reset_index(name='counts')
# Share of the counted tweets held by each language. The total is computed
# once and divided element-wise, instead of re-summing the whole column
# inside a per-row lambda.
langs['proportion'] = langs['counts'] / langs['counts'].sum()

In [52]:
# Display the per-language table (value, count, proportion).
langs

Unnamed: 0,unique_values,counts,proportion
0,en,4091,0.819183
1,und,704,0.140969
2,bn,123,0.02463
3,hi,31,0.006207
4,in,15,0.003004
5,tl,6,0.001201
6,mr,4,0.000801
7,es,3,0.000601
8,de,3,0.000601
9,te,3,0.000601


## To do:
- Location over time
- Source analysis
- Protected
- Verified