In [3]:
# Goal: build a logistic regression model that predicts 'verified_status'.
# Y (dependent)             = 'verified_status'
# Potential X (independent) = "video_duration_sec", "claim_status", "author_ban_status", "video_view_count",
#                    "video_like_count", "video_share_count", "video_download_count", "video_comment_count", "video_transcription_text"

In [None]:
# Import packages for data manipulation
import pandas as pd
import numpy as np

# Import packages for data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Import packages for data preprocessing
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.utils import resample

# Import packages for data modeling
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

#Import package to download data
from IPython.display import FileLink

In [None]:
# Read the dataset into a dataframe and expose a download link to the same file.
# The path is defined once so the loader and the link cannot drift apart.
file_path = 'tiktok_dataset.csv'
data = pd.read_csv(file_path)
FileLink(file_path)

In [None]:
# Get basic information.
# Only the last expression of a cell is rendered automatically, so the summary
# statistics and the shape are displayed explicitly; `info()` prints its own report.
display(data.describe())   # summary statistics of the numeric columns
display(data.shape)        # (rows, columns)
data.info()                # column dtypes and non-null counts

In [None]:
# Check for missing values and duplicated rows, then drop rows with missing values.
# The two counts are displayed explicitly: as bare expressions in the middle of the
# cell their results would be silently discarded.
display(data.isna().sum())         # missing values per column
display(data.duplicated().sum())   # number of fully duplicated rows (checked only, not dropped)
data = data.dropna(axis=0)         # keep only rows without any missing value

In [None]:
# Create boxplots for EDA, visualizing the distribution of the CONTINUOUS X (independent) variables.
# "video_id" is ignored; it was assigned by our system.
continuous_cols = ["video_duration_sec", "video_view_count", "video_like_count", "video_comment_count"]

# One small figure per variable, so every column named above is actually inspected
# (the outlier handling in the next step refers to the like/comment count boxplots).
for col in continuous_cols:
    plt.figure(figsize=(6,2))
    plt.title(f'Boxplot to detect outliers for {col}', fontsize=12)
    sns.boxplot(x=data[col])
    plt.show()

In [None]:
# Since outliers can strongly influence our model, extreme values are capped at the
# upper quantile limit (quant3 + 1.5 * iqr).
# This is done for BOTH "video_like_count" and "video_comment_count", the two columns
# the author flagged as having many extreme outliers in the boxplots.
for col in ["video_like_count", "video_comment_count"]:
    quant1 = data[col].quantile(0.25)
    quant3 = data[col].quantile(0.75)
    iqr = quant3 - quant1                 # interquartile range
    upper_lim = quant3 + 1.5 * iqr        # upper limit for this column

    # Replace every value above the limit with the limit itself
    data.loc[data[col] > upper_lim, col] = upper_lim

In [None]:
# EDA / cleaning of the CATEGORICAL variables: start with the class balance of the outcome.
# normalize=True returns proportions instead of raw counts.
status_share = data['verified_status'].value_counts(normalize=True)
status_share

In [None]:
# Output recorded from the previous cell:
# verified_status
# not verified    0.93712
# verified        0.06288
# Name: proportion, dtype: float64

In [None]:
# "verified" is far rarer than "not verified", so the two classes are balanced
# by upsampling the minority class with resample().
# NOTE(review): the upsampling runs before the train/test split further down, so copies
# of the same verified row can land in both sets — consider resampling the training
# split only.

# Separate the rows by class
status = data['verified_status']
data_major = data[status == 'not verified']   # majority class
data_minor = data[status == 'verified']       # minority class

# Draw from the minority class WITH replacement until it matches the majority size
data_minor_upsampled = resample(
    data_minor,
    replace=True,                  # the same row may be drawn more than once
    n_samples=len(data_major),
    random_state=0,
)

# Stack majority + upsampled minority; ignore_index=True renumbers the rows 0..n-1
data_upsampled = pd.concat([data_major, data_minor_upsampled], ignore_index=True)

# Recheck that both classes now have the same number of rows
data_upsampled['verified_status'].value_counts()

In [4]:
# Quantify `video_transcription_text` by its length in characters.
# The vectorised `.str.len()` accessor replaces the per-row Python lambda.
data_upsampled['text_length'] = data_upsampled['video_transcription_text'].str.len()

# Compare the mean text length of the two 'verified_status' categories
data_upsampled.groupby('verified_status')['text_length'].mean().reset_index()


NameError: name 'data_upsampled' is not defined

In [None]:
# Visualize the distribution of `video_transcription_text` length for videos
# posted by verified accounts and videos posted by unverified accounts.
# The two groups are stacked in one histogram and coloured by 'verified_status'.
sns.histplot(data = data_upsampled,
             stat = 'count',
             multiple = 'stack',
             x = 'text_length',
             kde = False,
             hue = 'verified_status',
             element = 'bars', legend = True
            )

# Extra details
plt.xlabel("video_transcription_text length (number of characters)")
plt.ylabel("Count")
plt.title("Distribution of video_transcription_text length for videos posted by verified accounts and videos posted by unverified accounts")
plt.show()  # a single call renders the figure (the duplicated second call was a no-op)

In [None]:
# Pairwise correlations between the numeric columns, used to spot strongly related variables
corr_matrix = data_upsampled.corr(numeric_only=True)
corr_matrix

In [None]:
# Heatmap of the pairwise correlations between the candidate features
heatmap_cols = [
    "video_duration_sec", "claim_status", "author_ban_status", "video_view_count",
    "video_like_count", "video_share_count", "video_download_count", "video_comment_count",
    "text_length",
]
# numeric_only=True silently leaves out any non-numeric column from the list above
feature_corr = data_upsampled[heatmap_cols].corr(numeric_only=True)

plt.figure(figsize=(8, 6))
sns.heatmap(feature_corr, annot=True, cmap="crest")  # annot=True prints each coefficient in its cell

plt.title("Heatmap of the dataset")
plt.show()

In [None]:
# All variables are prepared; pick the outcome (y) and the predictors (X)
# for the logistic regression model described at the top of the notebook.
feature_cols = [
    "video_duration_sec", "claim_status", "author_ban_status", "video_view_count",
    "video_share_count", "video_download_count", "video_comment_count",
]

y = data_upsampled['verified_status']
X = data_upsampled[feature_cols]
# text_length is not included: the author judged it uninformative from the heatmap.
# video_like_count (listed as a potential predictor) is left out as well.
# NOTE(review): no reason is stated for dropping video_like_count — presumably
# collinearity with the other engagement counts; confirm.

In [None]:
# Hold out 25% of the rows as a test set; random_state makes the shuffle reproducible
splits = train_test_split(X, y, test_size=0.25, random_state=0)
X_train, X_test, y_train, y_test = splits

# Shapes of the four pieces, in the same order as above
tuple(part.shape for part in splits)

In [None]:
# Inspect the categories of `claim_status` and `author_ban_status` before encoding them.
# Both results are displayed explicitly; a bare expression that is not the last line
# of the cell would not be shown.
display(X_train["claim_status"].unique())        # array(['opinion', 'claim'], dtype=object)
display(X_train["author_ban_status"].unique())   # array(['active', 'under review', 'banned'], dtype=object)

In [None]:
# SETTING UP X TRAIN ##

# The model needs numeric inputs, so the categorical columns are one-hot encoded
# and then re-attached to the numeric columns.
categorical_cols = ["claim_status", "author_ban_status"]

X_train_to_encode = X_train[categorical_cols]
# `X_train` itself is left untouched (the numeric part gets its own name), so this
# cell can be re-run without failing on already-dropped columns.
X_train_numeric = X_train.drop(columns=categorical_cols)


# One-hot encode; drop='first' omits one level per variable and
# sparse_output=False returns a dense numpy array
X_encoder = OneHotEncoder(drop='first', sparse_output=False)
X_train_encoded = X_encoder.fit_transform(X_train_to_encode)  # learn the categories and encode


# fit_transform returns a bare array without column names, so wrap it in a
# DataFrame labelled with the encoder's generated feature names.
X_train_encoded_df = pd.DataFrame(data=X_train_encoded, columns=X_encoder.get_feature_names_out())


# Concatenate the numeric columns and the encoded columns into the final training frame.
# `.reset_index(drop=True)` gives the numeric part a 0..n-1 index so its rows line up
# with `X_train_encoded_df`, which was built with a fresh default index.
X_train_final = pd.concat([X_train_numeric.reset_index(drop=True), X_train_encoded_df], axis=1)

In [None]:
# SETTING UP X TEST ##
# Same steps as for the training features, applied right after them for better organization.
X_test_to_encode = X_test[['claim_status', 'author_ban_status']]
# `X_test` itself is left untouched so the cell can be re-run without a KeyError.
X_test_numeric = X_test.drop(columns=["claim_status", "author_ban_status"])

# Reuse the encoder fitted on the training data: transform only, never refit on test data
X_test_encoded = X_encoder.transform(X_test_to_encode)

# Label the encoded array with the encoder's feature names
X_test_encoded_df = pd.DataFrame(data=X_test_encoded, columns=X_encoder.get_feature_names_out())

# reset_index(drop=True) aligns the numeric rows with the freshly indexed encoded frame
X_test_final = pd.concat([X_test_numeric.reset_index(drop=True), X_test_encoded_df], axis=1)

In [None]:
## SETTING UP Y TRAIN ##

# Show the unique values of the outcome variable (displayed explicitly; as a bare
# expression followed by more code its result would be discarded)
display(y_train.unique())  # array(['verified', 'not verified'], dtype=object)

# Encoder for the outcome. The encoder orders categories alphabetically and
# drop='first' removes the first one, so with the two labels above the single
# remaining column is 1.0 for 'verified' and 0.0 for 'not verified'.
y_encoder = OneHotEncoder(drop='first', sparse_output=False)

# The encoder expects a 2D array, hence reshape(-1, 1);
# .ravel() flattens the (n, 1) result back to the 1D shape the model expects.
y_train_final = y_encoder.fit_transform(y_train.values.reshape(-1, 1)).ravel()

In [None]:
## SETTING UP Y TEST ##

# Reuse the encoder fitted on y_train. Only `transform` is called here: refitting on
# the test labels would re-learn the categories from test data, and the 0/1 mapping
# could then differ from the one used for training.
y_test_final = y_encoder.transform(y_test.values.reshape(-1, 1)).ravel()



In [None]:
# Final sanity check: shapes of the encoded train/test features and labels
final_parts = (X_train_final, y_train_final, X_test_final, y_test_final)
tuple(part.shape for part in final_parts)

In [None]:
# Logistic Regression Model
#   1. FIT the model on the TRAINING data (this cell)
#   2. PREDICT on the test features and COMPARE with the true test labels (next cells)
log_reg = LogisticRegression(random_state=0, max_iter=800)
log_clf = log_reg.fit(X_train_final, y_train_final)  # fit() returns the fitted estimator itself

In [None]:
# EVALUATION

# Predicted class for every row of the held-out test features
y_pred = log_clf.predict(X_test_final)

In [None]:
# EVALUATION Visually

# Class labels as learned by the fitted model; they fix the row/column order of the matrix
class_labels = log_clf.classes_

# Count true vs. predicted labels, then wrap the counts in a display object
log_cm = confusion_matrix(y_test_final, y_pred, labels=class_labels)
log_disp = ConfusionMatrixDisplay(confusion_matrix=log_cm, display_labels=class_labels)

log_disp.plot()
plt.show()

In [None]:
# EVALUATION Statistically

# Accuracy = share of test rows whose prediction matches the true label.
# It is computed from the arrays rather than from hand-copied confusion-matrix counts,
# so it stays correct whenever the data or the model changes.
accuracy = (y_pred == y_test_final).mean()
display(accuracy)

# Accuracy, precision and recall per class.
# target_names must follow the sorted encoded labels (0.0, 1.0). The outcome encoder
# dropped its first category ('not verified'), so 0 = "not verified" and 1 = "verified".
target_labels = ["not verified", "verified"]
print(classification_report(y_test_final, y_pred, target_names=target_labels))

# Intercept of the fitted model (displayed explicitly; not the last line of the cell)
display(log_clf.intercept_)

# Coefficient of every X feature in predicting Y ('verified_status')
pd.DataFrame(data={"Feature Name":log_clf.feature_names_in_, "Model Coefficient":log_clf.coef_[0]})