# Exploratory Data Analysis

Exploration of our data about the Drupal Community.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter


In [None]:
import time
import random
from multiprocessing import Pool

import numpy as np
import pandas as pd
from tqdm import tqdm

# Register tqdm's `progress_apply` on pandas objects; `batch_predict_sentiment`
# below relies on it to show a progress bar.
tqdm.pandas()


def predict_sentiment(review):
    """Return a placeholder sentiment label for a single review.

    Stand-in for a real model: it pauses for 0.1 s to mimic inference
    latency, ignores ``review`` and derives the label from a random draw.

    Args:
        review: The review text (not used by this stub).

    Returns:
        ``"Positive"`` when the draw from 1..10 is greater than 5,
        otherwise ``"Negative"``.
    """
    time.sleep(0.1)  # emulate the latency of a real prediction call
    draw = random.randint(1, 10)
    if draw > 5:
        return "Positive"
    return "Negative"


def batch_predict_sentiment(review_df):
    """Label every review of one batch with a predicted sentiment.

    Relies on ``tqdm.pandas()`` having been called so that
    ``Series.progress_apply`` is available.

    Args:
        review_df: DataFrame with a ``comments`` column holding the review text.

    Returns:
        A new DataFrame with the rows and columns of ``review_df`` plus a
        ``sentiment`` column. The input frame is not modified.
    """
    sentiment = review_df["comments"].progress_apply(predict_sentiment)
    # `assign` returns a new frame instead of writing into the caller's object,
    # which may be a slice of a larger frame (SettingWithCopyWarning).
    return review_df.assign(sentiment=sentiment)


def fetch_sentiment_for_review(n_cores=64, reviews_path="data/reviews.csv"):
    """Predict a sentiment for every review of a CSV file with a process pool.

    Args:
        n_cores: Maximum number of worker processes / batches. Defaults to 64,
            the value that used to be hard-coded.
        reviews_path: CSV file to read. It must provide the ``comments``
            column consumed by ``batch_predict_sentiment``.

    Returns:
        One DataFrame holding the rows of the input file, in their original
        order, with the ``sentiment`` column added.

    Raises:
        ValueError: If ``n_cores`` is smaller than 1.
    """
    if n_cores < 1:
        raise ValueError("n_cores must be at least 1")

    reviews = pd.read_csv(reviews_path)

    # Never create more batches (or workers) than there are rows, so no worker
    # is started just to process an empty frame.
    n_batches = max(1, min(n_cores, len(reviews)))

    # Split by row position rather than passing the DataFrame to
    # `np.array_split`, which goes through the deprecated `DataFrame.swapaxes`.
    row_groups = np.array_split(np.arange(len(reviews)), n_batches)
    review_batches = [reviews.iloc[rows] for rows in row_groups]

    # NOTE(review): functions defined in a notebook cell may not be picklable
    # under the "spawn" start method; confirm on the target platform.
    with Pool(n_batches) as pool:
        # `pool.map` keeps the batch order, so concatenation restores row order.
        sentiment_prediction_batches = pool.map(
            batch_predict_sentiment, review_batches)

    # Stitch the per-batch results back into a single DataFrame.
    return pd.concat(sentiment_prediction_batches)


# Run the parallel sentiment prediction over the review file.
reviews_with_sentiment = fetch_sentiment_for_review()


## Users data

### Understand the Data Structure

Run the following checks to get a sense of the dataset:

In [None]:
# Read the user export from Parquet and show its (rows, columns) shape.
users_path = '../data/user.parquet'
df = pd.read_parquet(users_path)
df.shape


(2093637, 18)

In [None]:
# Basic info on columns and data types. `DataFrame.info()` prints its report
# and returns None, so it is called on its own rather than passed to
# `display`, where it rendered a stray "None".
df.info()

display(
    # Summary statistics (only useful for numerical fields)
    df.describe(),
    # Check for missing values
    df.isnull().sum(),
)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2093637 entries, 0 to 2093636
Data columns (total 18 columns):
 #   Column         Dtype 
---  ------         ----- 
 0   id             int32 
 1   title          string
 2   fname          string
 3   lname          string
 4   created        int32 
 5   da_membership  string
 6   slack          string
 7   mentors        object
 8   countries      object
 9   language       object
 10  languages      object
 11  timezone       string
 12  region         string
 13  city           object
 14  organizations  object
 15  industries     object
 16  contributions  object
 17  events         object
dtypes: int32(2), object(9), string(7)
memory usage: 271.5+ MB


None

Unnamed: 0,id,created
count,2093637.0,2093637.0
mean,1823387.0,1346010000.0
std,1233388.0,134703200.0
min,1.0,986039000.0
25%,641740.0,1257520000.0
50%,1758646.0,1326227000.0
75%,2929359.0,1403362000.0
max,3843802.0,1743017000.0


id                     0
title                  0
fname            1609139
lname            1620054
created                0
da_membership    2090968
slack            2090244
mentors                0
countries              0
language         2080667
languages              0
timezone         1425587
region           1426273
city             1426273
organizations          0
industries       2093637
contributions          0
events                 0
dtype: int64

### Data cleaning and preprocessing 

Since the dataset contains lists (e.g., countries, languages, mentors), we need to normalize these fields. We replace empty lists with `None`.

We convert timestamps to proper datetimes.

In [None]:
def _none_if_missing_or_empty(value):
    """Map missing scalars and zero-length values to None; keep everything else."""
    if value is None or (pd.api.types.is_scalar(value) and pd.isna(value)):
        return None
    if hasattr(value, '__len__') and len(value) == 0:
        return None
    return value


# Normalise object columns: missing values and empty lists/arrays become a real
# None. The literal string 'None' is deliberately not used as a filler, because
# it would hide missing values from `isnull()`.
for col in df.columns:
    if df[col].dtype == 'O':
        df[col] = df[col].apply(_none_if_missing_or_empty)

# Convert the `created` epoch seconds to datetimes (skipped when the column
# already exists, so the cell can be re-run safely).
if 'registered_on' not in df.columns:
    df['registered_on'] = pd.to_datetime(df['created'], unit='s')


In [None]:
# The frame loaded in this notebook is `df` (no `users` variable is defined).
# The year is derived straight from the `created` epoch seconds so the cell
# does not depend on a separately computed `registered_on` column.
registration_years = pd.to_datetime(df['created'], unit='s').dt.year
ax = registration_years.value_counts().sort_index().plot(kind="bar")
ax.set_title("User Registrations Per Year")
ax.set_xlabel("Year")
ax.set_ylabel("Registered users");


KeyError: 'registered_on'

The only numerical data we can *describe* is the registration date.

In [None]:
# `registered_on` is only added further down the notebook, so describe the
# dates derived from the `created` epoch seconds directly.
pd.to_datetime(df['created'], unit='s').describe()


---

@todo Define next steps of this exploratory analysis

### Cleaning

There are a lot of empty values in this dataset.

In [None]:
# Count missing values per column. The frame loaded in this notebook is `df`;
# no `users` variable is defined on a fresh kernel.
df.isnull().sum()


id                     0
title                  0
fname            1609139
lname            1620054
created                0
da_membership    2090968
slack            2090244
mentors                0
countries              0
language         2080667
languages              0
timezone         1425587
region           1426273
city             1426273
organizations          0
industries       2093637
contributions          0
events                 0
dtype: int64

We can normalize empty data using `None` and get proper datetime values for the registration date.

In [None]:
# Cleaning data.
df = df.replace({pd.NA: None})

# Replace empty arrays with None.
for col in df.columns:
    if df[col].dtype == 'O':
        df[col] = df[col].apply(lambda x: None if (
            x is None or len(x) == 0) else x)


NameError: name 'pd' is not defined

In [None]:
import datetime

# Add the registration date as a datetime column (skipped when it already
# exists, so the cell can be re-run safely).
# `pd.to_datetime(..., unit='s')` is vectorised and does not depend on the
# machine's local timezone, unlike a per-row `datetime.fromtimestamp`.
if 'registered_on' not in df.columns:
    df['registered_on'] = pd.to_datetime(df['created'], unit='s')


Now let's take a look at the actual data:

In [None]:
# Preview the first five rows of the cleaned frame.
df.head(n=5)


The only numerical data we can *describe* is the registration date.

In [None]:
# Summarise the distribution of registration dates.
registration_dates = df['registered_on']
registration_dates.describe()
