# Import libraries

In [2]:
import pandas as pd 
import numpy as np
import scipy
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import string
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import save_npz


# Preprocessing Text function

In [None]:
def _get_text_resources():
    """Return the (stop-word set, lemmatizer) pair, building it on first use only.

    preprocess_text is applied once per document, so re-reading the stop-word
    list and re-creating the lemmatizer on every call is wasted work.
    """
    if not hasattr(_get_text_resources, "_cached"):
        _get_text_resources._cached = (
            frozenset(stopwords.words('english')),
            WordNetLemmatizer(),
        )
    return _get_text_resources._cached


def preprocess_text(text):
    """Normalize a raw text into a space-separated string of lemmatized tokens.

    Steps: lowercase, tokenize with NLTK's ``word_tokenize``, keep only purely
    alphabetic tokens, drop English stop words, lemmatize what remains.

    Note that tokens containing any digit or punctuation character are dropped
    entirely (not stripped of those characters), because only tokens for which
    ``str.isalpha()`` is true are kept.

    Parameters
    ----------
    text : str
        Raw text. Any non-string value (e.g. ``None`` or ``NaN`` from a missing
        JSON field) is treated as an empty document.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces; ``''`` if nothing remains.
    """
    # Missing values arrive as None/NaN rather than str; treat them as empty text
    # instead of failing on `.lower()`.
    if not isinstance(text, str):
        return ''

    stop_words, lemmatizer = _get_text_resources()

    # Tokenize the lowercased text into individual words
    tokens = word_tokenize(text.lower())

    # Keep alphabetic tokens only. Such tokens cannot contain punctuation, so no
    # separate punctuation-stripping pass is needed.
    tokens = [token for token in tokens if token.isalpha()]

    # Remove English stop words, then lemmatize the survivors
    tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]

    return ' '.join(tokens)

# Vectorizer, scaler and label encoder definition

In [3]:
# Shared transformer instances used by the sections below.
# `vectorizer` and `scaler` are fitted again in the Twitter section, so each
# keeps only the state of its most recent fit_transform call.
label_encoder = LabelEncoder()   # encodes the categorical column
scaler = MinMaxScaler()          # rescales the numeric columns
vectorizer = TfidfVectorizer()   # TF-IDF encoding of the preprocessed text

# User

## Reading the JSON

In [4]:
# NOTE(review): absolute, machine-specific path — this cell only runs where the file exists at this location.
user_collection_path = '/Users/raffaelerusso/Documents/GitHub/Youtube-Video-Classification-on-Twitter/Filtered_collections/user_collection.json'
df = pd.read_json(user_collection_path)

## Preprocessing user description

In [4]:
# Clean every user description; the column is overwritten with the preprocessed text.
df['description'] = df['description'].apply(preprocess_text)

## Encoding textual fields

In [5]:
# Fit the TF-IDF vectorizer on the preprocessed descriptions and encode them in one step.
descriptions = vectorizer.fit_transform(df['description'])
# Sanity check: print the shape of the encoded result.
print(descriptions.shape)

(301724, 154810)


## Normalize numerical features between 0 and 1

In [5]:

# Numeric user columns to rescale with the shared scaler.
numerical_features = [
    'friends_count',
    'listed_count',
    'statuses_count',
    'followers_count',
    'favourites_count',
]
x_numeric = scaler.fit_transform(df[numerical_features])


## Encode categorical feature

In [6]:
categorical_feature = 'verified'
# The encoder returns a 1-D array; reshape it into a single column so it can be
# stacked horizontally next to the other feature columns.
x_categoric = label_encoder.fit_transform(df[categorical_feature]).reshape(-1, 1)

## Create the feature matrix

In [12]:
# Column-wise concatenation: TF-IDF description features, then the scaled
# numeric columns, then the encoded categorical column.
user_matrix = scipy.sparse.hstack([
    descriptions,
    x_numeric,
    x_categoric,
])

In [7]:
# Display the shape of the scaled numeric block as a quick sanity check.
x_numeric.shape

(301724, 5)

In [8]:
# User matrix without the description features (numeric + categorical columns only).
# Kept under its own name: rebinding `user_matrix` here would replace the sparse
# matrix built in the previous step with a dense ndarray, which the save_npz call
# in the save step cannot write.
user_matrix_nodesc = np.hstack((x_numeric, x_categoric))

## Save the feature matrix

In [10]:
# Persist the user feature matrix in SciPy's .npz sparse format; the relative
# filename resolves against the current working directory.
# NOTE(review): `user_matrix` must be a SciPy sparse matrix at this point.
save_npz('sparse_user_matrix.npz', user_matrix)
# Alternative for a dense NumPy matrix (save_npz only accepts SciPy sparse matrices):
#np.save('user_matrix_nodesc.npy', user_matrix)

# Twitter

## Reading the JSON

In [10]:
# NOTE(review): absolute, machine-specific path; its user directory and folder layout
# differ from the other two collection paths in this notebook — confirm it is correct.
twitter_collection_path = '/Users/alex/Documents/GitHub/Youtube-Video-Classification-on-Twitter/VideoClassification/Filtered_collections/twitter_collection.json'
df_twitter = pd.read_json(twitter_collection_path)

## Preprocessing text (and hashtag)

In [20]:
# Pull the raw tweet texts out of the frame, then clean each one.
twitter_texts = df_twitter['text'].to_list()
preprocess_twitter_texts = list(map(preprocess_text, twitter_texts))

In [21]:
# Hashtag preprocessing is disabled: the code below sits inside a string literal
# and is therefore never executed.
"""
twitter_hashtags = df_twitter['hashtag'].to_list()
preprocess_twitter_hashtags = [preprocess_text(x) for x in twitter_hashtags]
"""

## Encoding textual fields

In [None]:
# Re-fit the shared TF-IDF vectorizer on the tweet texts; this replaces whatever it
# learned from the user descriptions.
# NOTE(review): `preprocess_twitter_texts` is rebound from the list of cleaned strings
# to the encoded matrix, so re-running this cell on its own would feed the matrix
# back into the vectorizer — re-run the preprocessing cell first.
preprocess_twitter_texts = vectorizer.fit_transform(preprocess_twitter_texts)
# Hashtag encoding is disabled together with the hashtag preprocessing.
#preprocess_twitter_hashtags = vectorizer.fit_transform(preprocess_twitter_hashtags)
print(preprocess_twitter_texts.shape)

## Replacing '' values with 0

In [35]:
# Count columns in which a missing value is stored as an empty string.
numerical_features_twitter = [
    'rt_qtd_count', 'rt_rt_count', 'rt_fav_count', 'rt_reply_count',
    'qtd_qtd_count', 'qtd_fav_count', 'qtd_reply_count', 'qtd_rt_count',
]
for feature in numerical_features_twitter:
    # Replace '' with 0 in this column only. Indexing the frame with just the
    # boolean mask (df_twitter[mask] = 0) would overwrite every column of the
    # matching rows, wiping the other counts and all non-numeric fields.
    df_twitter.loc[df_twitter[feature] == '', feature] = 0

## Normalize numerical features between 0 and 1

In [36]:
# Re-fit the shared scaler on the Twitter columns listed in numerical_features_twitter;
# its earlier fit on the user columns is replaced.
x_numeric_twitter = scaler.fit_transform(df_twitter[numerical_features_twitter])

## Create the feature matrix

In [37]:
# Column-wise concatenation: TF-IDF tweet-text features followed by the scaled numeric columns.
twitter_matrix = scipy.sparse.hstack([
    preprocess_twitter_texts,
    x_numeric_twitter,
])

## Save the feature matrix

In [38]:
# Persist the Twitter feature matrix in SciPy's .npz sparse format; the relative
# filename resolves against the current working directory.
save_npz('sparse_twitter_matrix.npz', twitter_matrix)

# YouTube

## Reading the JSON

In [2]:
# NOTE(review): absolute, machine-specific path — this cell only runs where the file exists at this location.
youtube_collection_path = '/Users/raffaelerusso/Documents/GitHub/Youtube-Video-Classification-on-Twitter/Filtered_collections/youtube_collection.json'
df_yt = pd.read_json(youtube_collection_path)

## Encoding video id

In [5]:
# One row per video, allocated directly as a single column of ones.
# NOTE(review): every video id is encoded as the same constant (1.0), so this
# column carries no per-video information — confirm that this is intended.
id_yt = np.ones((len(df_yt['id']), 1))

## Label extraction

In [56]:
# Take the moderation status of each video as the label, shaped as a single column.
label_yt = df_yt['moderationStatus'].to_numpy().reshape(-1, 1)

(17123, 1)


## Saving feature and label

In [6]:
# Write both arrays as .npy files relative to the current working directory.
np.save('id_yt.npy', id_yt)
# NOTE(review): if label_yt has object dtype (e.g. Python strings), np.save pickles it
# and np.load will need allow_pickle=True to read it back — TODO confirm the dtype.
np.save('label_yt.npy', label_yt)