# Final Project
CS696 Big Data

Professor Whitney

Team:

Kristi Werry - 823386935

William Ritchie - 815829203

## Description

## Imports

In [4]:
import numpy as np
import pandas as pd
from pyspark.sql import Row
from pyspark.sql import DataFrame
from pyspark.sql import SparkSession
from pyspark.sql.types import IntegerType
from pyspark.sql.types import BooleanType
from pyspark.sql.types import StructType
from pyspark.sql.types import StructField
from pyspark.sql.types import StringType
from pyspark.sql.types import DateType
from pyspark.sql.types import TimestampType
from pyspark.sql.functions import to_timestamp
from pyspark.sql.functions import isnan, when, count, col
from pyspark.sql.functions import lit
import sys
from pyspark.sql import SparkSession

## Functions

In [21]:
def Load_Datasets(sqlContext, files, schema):
    """Read several CSV files into one DataFrame, tagging each row with a country.

    Args:
        sqlContext: Object used to create and read DataFrames (the notebook
            passes a SparkSession).
        files: Iterable of indexable entries where ``entry[0]`` is the CSV path
            and ``entry[1]`` is the value written into the "country" column.
        schema: Schema applied to the empty seed DataFrame and to every CSV read.

    Returns:
        The union of all per-file DataFrames, appended in the order given.
        When ``files`` is empty the empty seed DataFrame is returned.

    NOTE(review): no ``header`` option is passed to the reader, so a header
    line in a file would be read as a data row -- confirm this is intended.
    NOTE(review): ``timestampFormat`` has no time-of-day component -- confirm
    that keeping only the date part of the timestamp column is intended.
    """
    # Seed with an empty frame so every file is appended the same way.
    combined = sqlContext.createDataFrame([], schema=schema)
    for entry in files:
        tagged = sqlContext.read.csv(
            path=entry[0],
            schema=schema,
            dateFormat="yy.dd.MM",
            timestampFormat="yyyy-MM-dd",
        ).withColumn("country", lit(entry[1]))
        combined = combined.union(tagged)
    return combined

## Importing Datasets

In [64]:
# Create (or reuse) the Spark session used throughout the notebook.
sqlContext = (
    SparkSession.builder
    .appName("FinalProjectYoutube")
    .getOrCreate()
)
# Directory prefix that is prepended to every CSV file name below.
root_dir = "youtube-new/"

# Schema used when reading the data sets into a dataframe.  Each pair is
# (column name, Spark type); every column is declared nullable.
customSchema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in [
        ("video_id", StringType()),
        ("trending_date", DateType()),
        ("title", StringType()),
        ("channel_title", StringType()),
        ("category_id", StringType()),
        ("publish_time", TimestampType()),
        ("tags", StringType()),
        ("views", IntegerType()),
        ("likes", IntegerType()),
        ("dislikes", IntegerType()),
        ("comment_count", IntegerType()),
        ("thumbnail_link", StringType()),
        ("comments_disabled", BooleanType()),
        ("ratings_disabled", BooleanType()),
        ("video_error_or_removed", BooleanType()),
        ("description", StringType()),
        ("country", StringType()),
    ]
])

# Associate each csv file with the label written into its rows' "country" column.
# NOTE(review): "GB" is the ISO country code for the United Kingdom, not only
# England -- confirm that "England" is the intended label for GBvideos.csv.
data_files = [
    (root_dir + "CAvideos.csv", "Canada"),
    (root_dir + "DEvideos.csv", "Germany"),
    (root_dir + "FRvideos.csv", "France"),
    (root_dir + "GBvideos.csv", "England"),
    (root_dir + "INvideos.csv", "India"),
    (root_dir + "JPvideos.csv", "Japan"),
    (root_dir + "KRvideos.csv", "South Korea"),
    (root_dir + "MXvideos.csv", "Mexico"),  # label was misspelled "Mexixo"
    (root_dir + "RUvideos.csv", "Russia"),
    (root_dir + "USvideos.csv", "US"),
]

# Read every file with the shared schema and union them into one DataFrame
youtube_data_df = Load_Datasets(sqlContext, data_files, customSchema)

## Dataset Cleaning 

#### Handle Duplicates

In [65]:
# Dropping duplicate rows based on video_id alone removed about half of the dataset, so we
# inspect the repeated rows to understand what is going on.  The same video can be trending
# for multiple days and in different countries, which is why it shows up in multiple rows.
# We decided these "duplicates" are not truly duplicate rows: the information carried by the
# repeated entries is still useful.  A truly duplicate row must share the same "video_id",
# "views", "likes", "dislikes", "country", and "trending_date" column values.
pandasdf = pd.DataFrame(youtube_data_df.take(100000), columns=youtube_data_df.columns)

# Rows whose video_id already appeared earlier in the sampled rows.
pandas_df = pandasdf[pandasdf['video_id'].duplicated()]

# A number of the duplicates have "\n" as the video id.  Those rows carry no useful
# information, so they are filtered out of this view; in the Spark dataset they are removed
# later on, when NA values are dropped.
pandas_df = pandas_df.loc[pandas_df['video_id'] != "\\n"]

# This video id was picked by hand from the dataframe built above.  The video was trending
# for multiple days and in multiple countries, hence its multiple rows in the dataset.
pandasdf.loc[pandasdf['video_id'] == "n1WpP7iowLc"]

Unnamed: 0,video_id,trending_date,title,channel_title,category_id,publish_time,tags,views,likes,dislikes,comment_count,thumbnail_link,comments_disabled,ratings_disabled,video_error_or_removed,description,country
1,n1WpP7iowLc,2017-11-14,Eminem - Walk On Water (Audio) ft. Beyoncé,EminemVEVO,10,2017-11-10,"""Eminem""|""Walk""|""On""|""Water""|""Aftermath/Shady/...",17158579.0,787425.0,43420.0,125882.0,https://i.ytimg.com/vi/n1WpP7iowLc/default.jpg,False,False,False,Eminem's new track Walk on Water ft. Beyoncé i...,Canada
230,n1WpP7iowLc,2017-11-15,Eminem - Walk On Water (Audio) ft. Beyoncé,EminemVEVO,10,2017-11-10,"""Eminem""|""Walk""|""On""|""Water""|""Aftermath/Shady/...",20539417.0,840642.0,47715.0,124236.0,https://i.ytimg.com/vi/n1WpP7iowLc/default.jpg,False,False,False,Eminem's new track Walk on Water ft. Beyoncé i...,Canada
520,n1WpP7iowLc,2017-11-16,Eminem - Walk On Water (Audio) ft. Beyoncé,EminemVEVO,10,2017-11-10,"""Eminem""|""Walk""|""On""|""Water""|""Aftermath/Shady/...",22702386.0,869304.0,50018.0,123235.0,https://i.ytimg.com/vi/n1WpP7iowLc/default.jpg,False,False,False,Eminem's new track Walk on Water ft. Beyoncé i...,Canada
864,n1WpP7iowLc,2017-11-17,Eminem - Walk On Water (Audio) ft. Beyoncé,EminemVEVO,10,2017-11-10,"""Eminem""|""Walk""|""On""|""Water""|""Aftermath/Shady/...",24578152.0,891283.0,51978.0,125444.0,https://i.ytimg.com/vi/n1WpP7iowLc/default.jpg,False,False,False,Eminem's new track Walk on Water ft. Beyoncé i...,Canada
45598,n1WpP7iowLc,2017-11-14,Eminem - Walk On Water (Audio) ft. Beyoncé,EminemVEVO,10,2017-11-10,"""Eminem""|""Walk""|""On""|""Water""|""Aftermath/Shady/...",17158579.0,787424.0,43420.0,125882.0,https://i.ytimg.com/vi/n1WpP7iowLc/default.jpg,False,False,False,Eminem's new track Walk on Water ft. Beyoncé i...,Germany
92590,n1WpP7iowLc,2017-11-14,Eminem - Walk On Water (Audio) ft. Beyoncé,EminemVEVO,10,2017-11-10,"""Eminem""""|""""Walk""""|""""On""""|""""Water""""|""""Aftermat...",17158579.0,787425.0,43420.0,125882.0,https://i.ytimg.com/vi/n1WpP7iowLc/default.jpg,False,False,False,Eminem's new track Walk on Water ft. Beyoncé i...,France


In [66]:
# The previous section determined that video_id alone is not sufficient to identify a truly
# duplicate row.  The columns below are the combination we think does: if two rows have the
# same value in all of them, those two rows are indeed duplicates.
compare_duplicate_cols = ["video_id", "views", "likes", "dislikes", "country", "trending_date"]
row_count_with_dup = youtube_data_df.count()

# Drop duplicate rows
youtube_data_df = youtube_data_df.dropDuplicates(compare_duplicate_cols)

# count() is a Spark action that runs a job every time it is called, so take the
# post-deduplication count once and reuse it for both printed figures.
row_count_without_dup = youtube_data_df.count()

# View duplicate row count information
num_duplicates = row_count_with_dup - row_count_without_dup
print("Number of duplicates: " + str(num_duplicates))
print("Remaining number of rows: " + str(row_count_without_dup))

Number of duplicates: 48595
Remaining number of rows: 368284


#### Handle NAs and Nulls

In [67]:
# The following is the list of columns that we determined should not contain a null or NA
# value.  We could have enforced this in the schema when importing the data, but we wanted to
# learn more about which columns contain nulls and NAs first.  After exploring the data we
# found that many values in the description column were NA.  We decided that this is
# acceptable because some videos might not have a description, which is why description is
# not included in the list below.
# NOTE(review): publish_time is also absent from this list and no reason is given here --
# confirm that omission is intentional.
no_nan_cols = ["video_id", "trending_date", "title",
        "channel_title", "category_id",
        "tags", "views", "likes", "dislikes", "comment_count",
        "thumbnail_link", "comments_disabled", "ratings_disabled",
        "video_error_or_removed", "country"]

row_count_with_nans = youtube_data_df.count()

# Drop rows that have a null/NaN in any of these columns
youtube_data_df = youtube_data_df.na.drop(subset=no_nan_cols)

# count() is a Spark action that runs a job every time it is called, so take the
# post-drop count once and reuse it for both printed figures.
row_count_without_nans = youtube_data_df.count()

# View how many rows were dropped and how many remain
num_nans = row_count_with_nans - row_count_without_nans
print("Number of null,nans, and na's: " + str(num_nans))
print("Remaining number of rows: " + str(row_count_without_nans))

Number of null,nans, and na's: 4923
Remaining number of rows: 363361


In [72]:
# Replace every null in the description column with an empty string so that later code
# working with this column does not need to check for nulls.
youtube_data_df = youtube_data_df.na.fill("", subset="description")

# Sanity check: count the rows whose description is still null
youtube_data_df.filter(col("description").isNull()).count()

0