In [None]:
##Load Libraries

library(ggplot2)
library(tidyverse)
library(dplyr)
library(readr)
library(cowplot)
library(olsrr)
library(caret)
library(pscl)
library(lmtest)
library(ipred)
library(survival)
library(ResourceSelection)
library(survey)
library(lmtest)
library(pROC)
library(DescTools)

## NOTE: This is a proof of concept. Further validation work needs to take place.
## The train/test datasets still need to be created from `data` (see the
## caTools split that follows).

# Read the CSV into a data frame, keeping strings as character vectors.
# `FALSE` is spelled out because `F` is an ordinary variable that can be
# reassigned, while `FALSE` is a reserved word.
data <- read.csv(
  "../input/trending-tv-shows-on-netflix/TV Shows - Netflix.csv",
  stringsAsFactors = FALSE
)

head(data)    # first few rows, as a quick sanity check
summary(data) # per-column summary of the data frame
dim(data)     # number of rows and columns
str(data)     # column types and a preview of each column
# The str() output shows which columns still need to be treated as factors
# and helps spot columns with missing values.

# Outcome column used as the response in the model fitted later.
x <- data$Netflix
hist(x)

# Baseline accuracy: counts per outcome value, from which the
# majority-class share can be read.
table(x)

In [None]:
# Train/test split using caTools
library(caTools)

# Fix the RNG seed so the random split is reproducible.
set.seed(123)

# Logical vector over the rows of `data`; TRUE marks rows that go to the
# training set (split ratio 0.80), FALSE marks rows kept for testing.
split <- sample.split(x, SplitRatio = 0.8)

datatrain <- subset(data, split)
datatest <- subset(data, !split)

# Peek at both partitions.
head(datatrain)
head(datatest)

nrow(datatrain) # number of training samples
nrow(datatest)  # number of test samples

In [None]:
# Logistic regression model
# Response: Netflix. Predictors: Year and Rating.
# To model something else, put the dependent variable on the left of `~`
# and the independent variable(s) on the right.
formula <- Netflix ~ Year + Rating

# Fit on the training partition only; family = binomial makes this a
# logistic regression.
QualityLog <- glm(formula, family = binomial, data = datatrain)

summary(QualityLog)

In [None]:
# Predicted probabilities on the training set
# (no `newdata`, so predictions are for the data the model was fitted on).
predictTrain <- predict(object = QualityLog, type = "response")
summary(predictTrain)
predictTrain

# True outcomes of the training rows.
y <- datatrain$Netflix

# Average predicted probability within each true outcome group.
tapply(predictTrain, y, mean)

# Choosing the threshold t:
# t is usually picked according to which kind of error is more
# acceptable. A perfect model would make no errors at all, so the
# choice of t would not matter, but models that predict perfectly
# are rare in practice.

In [None]:
library(ROCR)

# Pair the training-set probabilities with the true labels.
ROCRpred <- prediction(predictions = predictTrain, labels = y)

# True positive rate (y axis) against false positive rate (x axis).
ROCRperf <- performance(ROCRpred, measure = "tpr", x.measure = "fpr")

# 1) Plain ROC curve
plot(ROCRperf)

# 2) Same curve, colorized
plot(ROCRperf, colorize = TRUE)

# 3) Colorized curve with cutoff labels printed every 0.1
plot(
  ROCRperf,
  colorize = TRUE,
  print.cutoffs.at = seq(0, 1, by = 0.1),
  text.adj = c(-0.2, 1.7)
)

In [None]:
# Confusion matrices at a probability threshold of 0.5.
#
# The thresholded prediction is wrapped in a factor with both levels fixed.
# A bare logical passed to table() yields only one column when every
# prediction falls on the same side of the threshold; the result is then
# not 2 x 2, which breaks fourfoldplot() and the diag()-based metrics that
# consume `cm`. Fixing the levels always yields FALSE and TRUE columns
# (possibly containing zeros).
#
# Both tables use `>=` so that training and test predictions are
# classified by the same rule.
threshold <- 0.5

# Training set: rows are true outcomes, columns are predicted classes.
table(
  actual = y,
  predicted = factor(predictTrain >= threshold, levels = c(FALSE, TRUE))
)

# Test set: predict on the held-out rows, then tabulate the same way.
predictTest <- predict(QualityLog, type = "response", newdata = datatest)
z <- datatest$Netflix
cm <- table(
  actual = z,
  predicted = factor(predictTest >= threshold, levels = c(FALSE, TRUE))
)
cm

In [None]:
# Fourfold display of the test-set confusion matrix `cm`.
fourfoldplot(x = cm)

In [None]:
# Quantities derived from the test-set confusion matrix `cm`
# (rows = actual classes, columns = predicted classes).
n <- sum(cm)                                  # total number of instances
nc <- nrow(cm)                                # number of classes
diag <- diag(cm)                              # correctly classified count per class
rowsums <- apply(cm, MARGIN = 1, FUN = sum)   # instances per actual class
colsums <- apply(cm, MARGIN = 2, FUN = sum)   # predictions per predicted class
p <- rowsums / n                              # share of instances in each actual class
q <- colsums / n                              # share of instances in each predicted class

# Accuracy: correctly classified instances over all instances
accuracy <- sum(diag) / n
accuracy

# Precision, per class: correct predictions over all predictions of that class
precision <- diag / colsums
precision

# Sensitivity (recall), per class: correct predictions over all actual members
recall <- diag / rowsums
recall

# F score, per class: harmonic mean of precision and recall
f1 <- 2 * precision * recall / (precision + recall)
f1

In [None]:
# Area under the ROC curve (AUC)
library(ROCR)

# Training-set AUC
pred <- prediction(predictions = predictTrain, labels = y)
as.numeric(slot(performance(pred, measure = "auc"), "y.values"))

# Test-set AUC
pred1 <- prediction(predictions = predictTest, labels = z)
as.numeric(slot(performance(pred1, measure = "auc"), "y.values"))

# ROC curve for the test set, plotted with the AUC printed on the figure
test_prob <- predict(QualityLog, newdata = datatest, type = "response")
test_roc <- roc(z ~ test_prob, plot = TRUE, print.auc = TRUE)

In [None]:
# Pseudo R-squared measures for the fitted logistic model,
# one call per measure so each value is displayed on its own.
PseudoR2(QualityLog, which = "Efron")
PseudoR2(QualityLog, which = "McFadden")
PseudoR2(QualityLog, which = "CoxSnell")
PseudoR2(QualityLog, which = "Nagelkerke")
PseudoR2(QualityLog, which = "Tjur")

In [None]:
# Cook's distance plot for the fitted model
# (diagnostic plot number 4), labelling the 3 most extreme points.
plot(x = QualityLog, which = 4, id.n = 3)

In [None]:
# Multicollinearity check
# Fit a linear model with the same formula on the full dataset and report
# the variance inflation factors of its predictors.
model <- lm(formula = formula, data = data)
car::vif(model)

In [None]:
# Wald test
library(car)

# Refit the formula on the full dataset as a logistic regression.
# `family = binomial` is required here: glm() defaults to the gaussian
# family, so without it the Wald test would be run on an ordinary linear
# model rather than on the logistic regression being studied.
model <- glm(formula, data = data, family = binomial)

# Type II Wald chi-square tests for each term. The argument name is
# written in full (`test.statistic`) instead of relying on partial
# matching of `test`.
Anova(model, type = "II", test.statistic = "Wald")