In [1]:
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('max_colwidth', None)
from nltk.corpus import stopwords
from string import punctuation
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
def remove_punctuation(dirty_string):
    non_words = list(punctuation)
    for word in non_words:
        dirty_string = dirty_string.replace(word, '')
    return dirty_string

def remove_stop_words(dirty_text):
    non_words = list(punctuation)
    language_stopwords = stopwords.words('english')
    cleaned_text = ''
    for word in dirty_text.split():
        if word in language_stopwords or word in non_words:
            continue
        else:
            cleaned_text += word + ' '
    return cleaned_text

def process_file(file_content):
    file_content = file_content.lower()
    file_content = remove_punctuation(file_content)
    file_content = remove_stop_words(file_content)
    return file_content

def countTfidfScore(ppt,vid):
    ppt = pd.read_csv(ppt)
    pptClean = process_file(str(ppt['PPT Sub Text']))
    vid = pd.read_csv(vid)
    vidGroup = [vid[(vid['Video ID'] == x)]['text'] for x in vid['Video ID'].unique()]

    vidScore = []
    for v in vidGroup:
        vidClean = process_file(str(v))
        vectorizer = CountVectorizer()
        X = vectorizer.fit_transform([pptClean,vidClean])
        vidScore.append(cosine_similarity(X,X)[0,1])
    countScoreDf = pd.concat([pd.DataFrame(vid['Video ID'].unique(), columns=['Video ID']), pd.DataFrame(vidScore, columns=['Count Vector Cosine Similarity'])], axis=1)
    vidScore = []
    for v in vidGroup:
        vidClean = process_file(str(v))
        vectorizer = TfidfVectorizer()
        X = vectorizer.fit_transform([pptClean,vidClean])
        vidScore.append(cosine_similarity(X,X)[0,1])
    countTfidfScoreDf = pd.concat([countScoreDf, pd.DataFrame(vidScore, columns=['TFIDF Vector Cosine Similarity'])], axis=1)
    return countTfidfScoreDf

In [3]:
countTfidfScore('scraping_data/pptFlamingo.csv','scraping_data/videoFlamingo.csv')

Unnamed: 0,Video ID,Count Vector Cosine Similarity,TFIDF Vector Cosine Similarity
0,_gs4XfbYPKc,0.136224,0.076312
1,b2AN1cPn3zY,0.101354,0.05657
2,FMjppU6AKBQ,0.119681,0.068209
3,Pz3XiJac57w,0.067821,0.036898
4,eki1Q0m9aVA,0.069244,0.037751
5,KhOhTjvOy-I,0.056837,0.031057
6,1iJJX7IJcFs,0.015264,0.007811
7,Kf6kjsSgd38,0.039441,0.02055
8,QLV_K7DVeyU,0.019078,0.009774
9,e5akODQ0CVo,0.053777,0.028739


In [4]:
countTfidfScore('scraping_data/pptPeafowl.csv','scraping_data/videoPeafowl.csv')

Unnamed: 0,Video ID,Count Vector Cosine Similarity,TFIDF Vector Cosine Similarity
0,ovxJXZdhRdM,0.081555,0.043571
1,EkJvstgd7oc,0.062438,0.03297
2,9WW6YJen-xw,0.124575,0.070291
3,7MFBWmDDZoU,0.046646,0.024559
4,PIy-hw_QH1s,0.084065,0.045899
5,Xux5j8vSPJU,0.028494,0.01481
6,Jbjeyyyeddo,0.094241,0.052036
7,qdbXT9zjSkk,0.010055,0.005174
8,TTwT1-TpFhE,0.095846,0.054977
9,vKn0gsah14w,0.050555,0.026757


In [5]:
countTfidfScore('scraping_data/pptTailorBird.csv','scraping_data/videoTailorBird.csv')

Unnamed: 0,Video ID,Count Vector Cosine Similarity,TFIDF Vector Cosine Similarity
0,jF0Id-hH9y4,0.107377,0.06903
1,kzRkWvwqkNI,0.12856,0.075999
2,g9Gj8JE72O4,0.052772,0.030137
3,6svAIgEnFvw,0.056668,0.030157
4,dvPuhLC9GjU,0.087257,0.050243
5,Mu6b3u_95Ts,0.055309,0.029162
6,4WFwXZVX4bo,0.026919,0.014219
7,jLzprmI1l20,0.053547,0.029615
8,zOYalG06UB0,0.054868,0.029423
9,4nZt3t6kpxs,0.042146,0.022156


In [6]:
countTfidfScore('scraping_data/pptAnt.csv','scraping_data/videoAnt.csv')

Unnamed: 0,Video ID,Count Vector Cosine Similarity,TFIDF Vector Cosine Similarity
0,cXUCUvcscXs,0.101501,0.057059
1,QNnmjyHPnbc,0.097333,0.054754
2,kFiDThjUBTk,0.106986,0.05975
3,CASrmm4BUJk,0.096744,0.053014
4,2IVb2Atu3Jc,0.118252,0.067813
5,A_hEZNxG_H8,0.050504,0.0274
6,9SPixBok5ls,0.124753,0.071255
7,NQ-8IuUkJJc,0.049394,0.025716
8,7_e0CA_nhaE,0.040837,0.02124
9,NVT2vUQMKUc,0.053647,0.02818


In [7]:
countTfidfScore('scraping_data/pptBeetle.csv','scraping_data/videoBeetle.csv')

Unnamed: 0,Video ID,Count Vector Cosine Similarity,TFIDF Vector Cosine Similarity
0,ZZcYUQhk4R0,0.298683,0.198785
1,veY5fyt66cg,0.204266,0.12832
2,F1-PGtF81Is,0.020113,0.010411
3,nFeh9VfV0z8,0.210531,0.126651
4,DAlhbxGkanU,0.11232,0.063364
5,60N9W5uxgu8,0.380134,0.241823
6,PZtT9SLOzYU,0.31465,0.22518
7,LE-0x9AVGh8,0.116297,0.066117
8,jpQnKYXr1vo,0.106514,0.061215
9,IfaItDqFr-w,0.126166,0.077026
