## Using all scraped data to explore the relationship between video features and views
#### 1) We will first do embedding on the title and descriptions of the videos using bad of words
#### 2) We will generate the likes to dislike ratio
#### 3) Then we will do topic modeling on the comments of each video 
#### 4) We will perform a lasso using the aboved mentioned variables and also the lenght of the video

In [0]:
# Importing all libraries
import pandas as pd
import numpy as np
import gensim
import operator
import random
from gensim import corpora, models
import re
import heapq
import nltk
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import PorterStemmer
from tqdm import tqdm
tqdm.pandas()
from sklearn.linear_model import Lasso
from sklearn.linear_model import LassoCV
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
nltk.download('stopwords')

  from pandas import Panel
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/nandinibasu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [0]:
from google.colab import drive
drive.mount('/content/drive')

In [0]:
import os
os.chdir("/content/drive/Shared drives/BAX 452-422 Group/Final Project")

In [0]:
all_data = pd.read_csv('video all data.csv')

In [0]:
all_data = all_data.drop(columns= 'subtitle')

In [0]:
all_data.head(5)

Unnamed: 0,ids,titles,description,views,date,likes,dislikes,length,Comments
0,8K7XiEOx3Fw,I Tested Amazon's Best-Selling Air Fryer • Alix,Alix is testing Amazon's best selling air frye...,104887.0,09/03/20,5108.0,307.0,750.0,['4 hrs to make mash potato delete your channe...
1,ysVnhqwlDbo,16 Ways To Up Your Breakfast Toast Game • Tasty,Toast in 2020 is far superior to anything you'...,128680.0,07/03/20,3932.0,88.0,342.0,"['Pretty intense music for some toasts...', 'W..."
2,OoH1oGAJ7jI,Tasty Producers Swap Their Favorite Snacks • R...,Andrew and Rie are swapping their favorite sna...,832586.0,07/03/20,23600.0,341.0,774.0,['Andrew seems like a craving pregnant woman. ...
3,7xycoVXyFGE,Stuffed French Toast by Chef Andrea Drummer,"Inspired by bread pudding and crème brûlée, th...",105588.0,05/03/20,4421.0,73.0,214.0,['Ngl all of the mini voice cracks in her voic...
4,oUeanf1tg7U,5 Homemade Dumplings To Feast On • Tasty,The tastiest dumplings you ever did see. Shop ...,324847.0,05/03/20,8608.0,175.0,383.0,"['Rather than being gentle with the dough, the..."


### 1. Bag of Words for title

In [0]:
# change all string to lower case
all_data['titles'] = all_data['titles'].str.lower()

In [0]:
# change all number into 'NUMBER' to analysis on the effect of using numbers in title
number_title = all_data['titles'].apply(lambda x: re.sub('\d+','NUMBER',str(x)))

In [0]:
number_title.head()

0      i tested amazon's best-selling air fryer • alix
1    NUMBER ways to up your breakfast toast game • ...
2    tasty producers swap their favorite snacks • r...
3          stuffed french toast by chef andrea drummer
4        NUMBER homemade dumplings to feast on • tasty
Name: titles, dtype: object

### Title is already very streamlined, and a lot of words in all kinds of stopwords is useful for title analysis such as 'I' and 'to' etc.. So I think we can delete some stopwords after 'most common word list' is created.

In [0]:
# tokenize word
tokenized_title = number_title.apply(word_tokenize)

In [0]:
tokenized_title.head(5)

0    [i, tested, amazon, 's, best-selling, air, fry...
1    [NUMBER, ways, to, up, your, breakfast, toast,...
2    [tasty, producers, swap, their, favorite, snac...
3    [stuffed, french, toast, by, chef, andrea, dru...
4    [NUMBER, homemade, dumplings, to, feast, on, •...
Name: titles, dtype: object

In [0]:
# Handling negation, create 'not_word'
for j in range(len(tokenized_title)):
  for i in range(len(tokenized_title[j])):
    try:
      if "n't" in tokenized_title[j][i] or tokenized_title[j][i] == "not":
        tokenized_title[j].pop(i)
        tokenized_title[j][i] = 'not_' + tokenized_title[j][i]
    except:
      pass

In [0]:
ps = PorterStemmer()
stemed_title = tokenized_title.apply(lambda x: [ps.stem(i) for i in x])

In [0]:
# Create commen word vectors
word_freq = {}
for sentence in stemed_title:
    for word in sentence:
        if word not in word_freq.keys():
            word_freq[word] = 1
        else:
            word_freq[word] += 1

In [0]:
# Create word vector
word_vector = heapq.nlargest(100, word_freq, key=word_freq.get)

In [0]:
# remove 'tasty' and '•' because it shows up in all videos
word_vector.pop(0)
word_vector.pop(0)

'•'

In [0]:
# function for word vector
def word_vectors(sentence):
  sent_vect = []
  for i in word_vector:
    if i in sentence:
      sent_vect.append(1)
    else:
      sent_vect.append(0)
  return sent_vect

In [0]:
word_list = stemed_title.apply(word_vectors)

In [0]:
title_dataframe = pd.DataFrame(list(word_list),columns=word_vector)

In [0]:
title_dataframe.head()

Unnamed: 0,number,to,recip,make,a,for,how,the,your,",",...,these,are,lg,usa,s,sweet,or,rice,rie,butter
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,1,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [0]:
all_data = pd.concat([all_data, title_dataframe], axis = 1)

In [0]:
all_data = all_data.drop(columns = ['titles'])

In [0]:
all_data.head(5)

Unnamed: 0,ids,description,views,date,likes,dislikes,length,Comments,number,to,...,these,are,lg,usa,s,sweet,or,rice,rie,butter
0,8K7XiEOx3Fw,Alix is testing Amazon's best selling air frye...,104887.0,09/03/20,5108.0,307.0,750.0,['4 hrs to make mash potato delete your channe...,0,0,...,0,0,0,0,0,0,0,0,0,0
1,ysVnhqwlDbo,Toast in 2020 is far superior to anything you'...,128680.0,07/03/20,3932.0,88.0,342.0,"['Pretty intense music for some toasts...', 'W...",1,1,...,0,0,0,0,0,0,0,0,0,0
2,OoH1oGAJ7jI,Andrew and Rie are swapping their favorite sna...,832586.0,07/03/20,23600.0,341.0,774.0,['Andrew seems like a craving pregnant woman. ...,0,0,...,0,0,0,0,0,0,0,0,1,0
3,7xycoVXyFGE,"Inspired by bread pudding and crème brûlée, th...",105588.0,05/03/20,4421.0,73.0,214.0,['Ngl all of the mini voice cracks in her voic...,0,0,...,0,0,0,0,0,0,0,0,0,0
4,oUeanf1tg7U,The tastiest dumplings you ever did see. Shop ...,324847.0,05/03/20,8608.0,175.0,383.0,"['Rather than being gentle with the dough, the...",1,1,...,0,0,0,0,0,0,0,0,0,0


### 2. Bag of Words for description

In [0]:
# change to lower case
description = all_data['description'].str.lower()

In [0]:
description = description.apply(lambda x:str(x))

In [0]:
tokenized_description = description.apply(word_tokenize)

In [0]:
stop_words_list = stopwords.words('english')
stop_words_list.remove('not')
symbol_list = ["'","/","\\", ",",".",";","$","(",")","..", "...", "?","!", ":","-","]","[", "#", "``", "''"]

In [0]:
# remove stop words
tokenized_description = tokenized_description.apply(lambda x: [i for i in x if i not in stop_words_list])

In [0]:
# handling not words
for j in range(len(tokenized_description)):
  for i in range(len(tokenized_description[j])):
    try:
      if "n't" in tokenized_description[j][i] or tokenized_description[j][i] == "not":
        tokenized_description[j].pop(i)
        tokenized_description[j][i] = 'not_' + tokenized_description[j][i]
    except:
      pass

In [0]:
# Stemming
ps = PorterStemmer()
stemmed_description = tokenized_description.apply(lambda x: [ps.stem(i) for i in x])

In [0]:
stemmed_description.head()

0    [alix, test, amazon, 's, best, sell, air, frye...
1    [toast, 2020, far, superior, anyth, 've, ever,...
2    [andrew, rie, swap, favorit, snack, ,, 's, pre...
3    [inspir, bread, pud, crème, brûlée, ,, stuf, f...
4    [tastiest, dumpl, ever, see, ., shop, new, tas...
Name: description, dtype: object

In [0]:
# Create commen word vectors
word_freq2 = {}
for sentence in stemmed_description:
    for word in sentence:
        if word not in word_freq2.keys():
            word_freq2[word] = 1
        else:
            word_freq2[word] += 1

In [0]:
len(word_freq2)

6213

Since there are only 6213 word in word_freq2 and description is not as important as title, we will use only 100 words in it.

In [0]:
word_vector2 = heapq.nlargest(100, word_freq2, key=word_freq2.get)

In [0]:
# Since the word vector contains a lot of website link (tell people to follow them), we delete some of it to decrese colinearity
word_vector_description = [
 'http',
 'food',
 'merch',
 'recip',
 'line',
 'facebook',
 'network',
 'instagram',
 'way',
 'offici',
 'everi',
 'help',
 'world',
 'thing',
 'interact',
 'lover',
 'channel',
 'youtub',
 'talent',
 'websit',
 'cookwar',
 'largest',
 'connect',
 'world-class',
 'kitchenwar',
 'food.connect',
 'new',
 'newslett',
 'shop',
 'audio',
 'musiclicens',
 'like',
 'networkhttp',
 'credit',
 'music',
 'product',
 'warner',
 'chappel',
 'check',
 'subscrib',
 'make',
 'want',
 'get',
 'delici',
 'provid',
 'audioblock',
 'tri',
 'dish',
 'favorit',
 'chicken',
 'holiday',
 'perfect',
 'easi',
 'imageshttp',
 'networksfx',
 'season',
 'one',
 'dinner',
 'chees',
 'cooki',
 'spice',
 'follow',
 'learn',
 'dessert',
 'parti',
 'next']

In [0]:
def word_vectors2(sentence):
  sent_vect = []
  for i in word_vector_description:
    if i in sentence:
      sent_vect.append(1)
    else:
      sent_vect.append(0)
  return sent_vect

In [0]:
word_list_decription = stemmed_description.apply(word_vectors2)

In [0]:
word_list_decription.head()

0    [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
1    [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
2    [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
3    [1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...
4    [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
Name: description, dtype: object

In [0]:
description_columns = ['descrip_'+i for i in word_vector_description]

In [0]:
description_df = pd.DataFrame(list(word_list_decription),columns=description_columns)

In [0]:
description_df.head()

Unnamed: 0,descrip_http,descrip_food,descrip_merch,descrip_recip,descrip_line,descrip_facebook,descrip_network,descrip_instagram,descrip_way,descrip_offici,...,descrip_one,descrip_dinner,descrip_chees,descrip_cooki,descrip_spice,descrip_follow,descrip_learn,descrip_dessert,descrip_parti,descrip_next
0,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,1,0,0,0,0
1,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
2,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,1,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
4,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0


In [0]:
all_data = pd.concat([all_data, description_df], axis = 1)

In [0]:
all_data = all_data.drop(columns = ['description'])

In [0]:
all_data.head()

Unnamed: 0,ids,views,date,likes,dislikes,length,Comments,number,to,recip,...,descrip_one,descrip_dinner,descrip_chees,descrip_cooki,descrip_spice,descrip_follow,descrip_learn,descrip_dessert,descrip_parti,descrip_next
0,8K7XiEOx3Fw,104887.0,09/03/20,5108.0,307.0,750.0,['4 hrs to make mash potato delete your channe...,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,ysVnhqwlDbo,128680.0,07/03/20,3932.0,88.0,342.0,"['Pretty intense music for some toasts...', 'W...",1,1,0,...,0,0,0,0,0,0,0,0,0,0
2,OoH1oGAJ7jI,832586.0,07/03/20,23600.0,341.0,774.0,['Andrew seems like a craving pregnant woman. ...,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,7xycoVXyFGE,105588.0,05/03/20,4421.0,73.0,214.0,['Ngl all of the mini voice cracks in her voic...,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,oUeanf1tg7U,324847.0,05/03/20,8608.0,175.0,383.0,"['Rather than being gentle with the dough, the...",1,1,0,...,0,0,0,0,0,0,0,0,0,0


### 3. Treating likes and dislikes 

In [0]:
ratio = all_data['likes']/all_data['dislikes']

In [0]:
all_data['ratio'] = ratio

In [0]:
all_data = all_data.drop(columns = ['likes', 'dislikes'])

In [0]:
all_data.head()

Unnamed: 0,ids,views,date,length,Comments,number,to,recip,make,a,...,descrip_dinner,descrip_chees,descrip_cooki,descrip_spice,descrip_follow,descrip_learn,descrip_dessert,descrip_parti,descrip_next,ratio
0,8K7XiEOx3Fw,104887.0,09/03/20,750.0,['4 hrs to make mash potato delete your channe...,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,16.638436
1,ysVnhqwlDbo,128680.0,07/03/20,342.0,"['Pretty intense music for some toasts...', 'W...",1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,44.681818
2,OoH1oGAJ7jI,832586.0,07/03/20,774.0,['Andrew seems like a craving pregnant woman. ...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,69.208211
3,7xycoVXyFGE,105588.0,05/03/20,214.0,['Ngl all of the mini voice cracks in her voic...,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,60.561644
4,oUeanf1tg7U,324847.0,05/03/20,383.0,"['Rather than being gentle with the dough, the...",1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,49.188571


### 4. Topic modeling for comment

In [0]:
comments = all_data['Comments']

In [0]:
comments = comments.apply(lambda x:str(x))

In [0]:
# revoming punctuation 
cleaned_comments = []
for j in range(len(comments)):
    text = ''
    for i in range(len(comments[j])):
        try:
            if comments[j][i] in symbol_list:
                continue
            else:
                text = text + comments[j][i]
        
        except:
            pass
    cleaned_comments.append(text)
                

In [0]:
all_data["cleaned comments"] = cleaned_comments

In [0]:
type(all_data["cleaned comments"])

pandas.core.series.Series

In [0]:
# function to remove emojis
def demojify(inputString):
    return inputString.encode('ascii', 'ignore').decode('ascii')

In [0]:
def stop_words_remove(x):
    words = ' '.join([a for a in x.split(' ') if a not in stop_words_list])
    return words

In [0]:
# removing emojis
all_data["cleaned comments"] = all_data["cleaned comments"].progress_apply(demojify)

100%|██████████| 1324/1324 [00:00<00:00, 97533.39it/s]


In [0]:
# converting to lowercase
all_data["cleaned comments"] = all_data["cleaned comments"].str.lower()

In [0]:
# removing stopwords 
all_data["cleaned comments"] = all_data["cleaned comments"].progress_apply(stop_words_remove)

100%|██████████| 1324/1324 [00:02<00:00, 472.59it/s]


In [0]:
# tokenizing
all_data["cleaned comments"] = all_data["cleaned comments"].progress_apply(word_tokenize)

100%|██████████| 1324/1324 [00:03<00:00, 407.26it/s]


In [0]:
# Handling negation, create 'not_word'
for j in range(len(all_data["cleaned comments"] )):
  for i in range(len(all_data["cleaned comments"] [j])):
    try:
      if "n't" in all_data["cleaned comments"] [j][i] or all_data["cleaned comments"] [j][i] == "not":
        all_data["cleaned comments"] [j].pop(i)
        all_data["cleaned comments"] [j][i] = 'not_' + all_data["cleaned comments"] [j][i]
    except:
      pass

In [0]:
# stemming
all_data["cleaned comments"]  = all_data["cleaned comments"] .progress_apply(lambda x: [ps.stem(i) for i in x])

100%|██████████| 1324/1324 [00:17<00:00, 73.76it/s]


In [0]:
comments = all_data["cleaned comments"] 

In [0]:
comments

0       [4, hr, make, mash, potato, delet, channel, go...
1       [pretti, intens, music, toast, learnt, today, ...
2       [andrew, seem, like, crave, pregnant, woman, s...
3       [ngl, mini, voic, crack, voic, make, want, han...
4       [rather, gentl, dough, tradit, way, make, doug...
                              ...                        
1319    [alright, time, go, make, breakfast, midnight,...
1320    [caption, say, pecan, not_pecanndeaf, peopl, o...
1321    [said, unknown, ``, when, your, earli, titl, s...
1322    [els, watch, fun, non, *watch, video, junk, fo...
1323    [351, faintli, hear, oh, ye, say, one, thing, ...
Name: cleaned comments, Length: 1324, dtype: object

# LDA Gensim Model

In [0]:
dictionary = corpora.Dictionary(comments)
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)
bow = [dictionary.doc2bow(comment) for comment in comments]

In [0]:
random.seed(100) 
lda = gensim.models.LdaMulticore(bow, 
                                 num_topics=4, 
                                 id2word=dictionary, 
                                 eval_every = 1)

In [0]:
for index, topic in lda.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(index, topic))

Topic: 0 
Words: 0.006*"rie" + 0.006*"cooki" + 0.006*"chocol" + 0.005*"cream" + 0.005*"chicken" + 0.004*"n" + 0.004*"cake" + 0.004*"chees" + 0.004*"ice" + 0.004*"bake"
Topic: 1 
Words: 0.007*"alix" + 0.007*"rie" + 0.006*"chicken" + 0.005*"chees" + 0.005*"cooki" + 0.004*"alvin" + 0.004*"vegan" + 0.004*"chocol" + 0.004*"fri" + 0.004*"dish"
Topic: 2 
Words: 0.005*"chicken" + 0.004*"sauc" + 0.004*"year" + 0.004*"cream" + 0.004*"chees" + 0.004*"bread" + 0.004*"egg" + 0.004*"n" + 0.004*"dish" + 0.003*"friend"
Topic: 3 
Words: 0.014*"rie" + 0.009*"chees" + 0.006*"cake" + 0.006*"chicken" + 0.004*"chef" + 0.004*"alvin" + 0.003*"said" + 0.003*"n" + 0.003*"year" + 0.003*"favorit"


# Feature extraction

In [0]:
lda.print_topics(4,num_words=10)


[(0,
  '0.006*"rie" + 0.006*"cooki" + 0.006*"chocol" + 0.005*"cream" + 0.005*"chicken" + 0.004*"n" + 0.004*"cake" + 0.004*"chees" + 0.004*"ice" + 0.004*"bake"'),
 (1,
  '0.007*"alix" + 0.007*"rie" + 0.006*"chicken" + 0.005*"chees" + 0.005*"cooki" + 0.004*"alvin" + 0.004*"vegan" + 0.004*"chocol" + 0.004*"fri" + 0.004*"dish"'),
 (2,
  '0.005*"chicken" + 0.004*"sauc" + 0.004*"year" + 0.004*"cream" + 0.004*"chees" + 0.004*"bread" + 0.004*"egg" + 0.004*"n" + 0.004*"dish" + 0.003*"friend"'),
 (3,
  '0.014*"rie" + 0.009*"chees" + 0.006*"cake" + 0.006*"chicken" + 0.004*"chef" + 0.004*"alvin" + 0.003*"said" + 0.003*"n" + 0.003*"year" + 0.003*"favorit"')]

In [0]:
train_vecs = []
for i in range(len(all_data)):
    top_topics = lda.get_document_topics(bow[i], minimum_probability=0.0)
    topic_vec = [top_topics[i][1] for i in range(4)]
    train_vecs.append(topic_vec)

In [0]:
topic_dis = pd.DataFrame(train_vecs)
topic_dis.head()

Unnamed: 0,0,1,2,3
0,0.20447,0.62151,0.173062,0.000957
1,0.000633,0.001032,0.997703,0.000632
2,0.01321,0.723245,0.000443,0.263102
3,0.00088,0.000761,0.991751,0.006609
4,0.02603,0.693203,0.276989,0.003779


In [0]:
all_data = pd.concat([all_data, topic_dis], axis = 1)
len(all_data)

1324

In [0]:
all_data = all_data.drop(columns = ['Comments', 'cleaned comments'])

In [0]:
len(all_data)

1324

In [0]:
all_data.head()

Unnamed: 0,ids,views,date,length,number,to,recip,make,a,for,...,descrip_follow,descrip_learn,descrip_dessert,descrip_parti,descrip_next,ratio,0,1,2,3
0,8K7XiEOx3Fw,104887.0,09/03/20,750.0,0,0,0,0,0,0,...,1,0,0,0,0,16.638436,0.20447,0.62151,0.173062,0.000957
1,ysVnhqwlDbo,128680.0,07/03/20,342.0,1,1,0,0,0,0,...,0,0,0,0,0,44.681818,0.000633,0.001032,0.997703,0.000632
2,OoH1oGAJ7jI,832586.0,07/03/20,774.0,0,0,0,0,0,0,...,0,0,0,0,0,69.208211,0.01321,0.723245,0.000443,0.263102
3,7xycoVXyFGE,105588.0,05/03/20,214.0,0,0,0,0,0,0,...,1,0,0,0,0,60.561644,0.00088,0.000761,0.991751,0.006609
4,oUeanf1tg7U,324847.0,05/03/20,383.0,1,1,0,0,0,0,...,0,0,0,0,0,49.188571,0.02603,0.693203,0.276989,0.003779


### 5. Lasso regression

In [0]:
# preparing data for lasso
all_data = all_data.drop(columns = ['date', 'ids'])

In [0]:
# dropping any NA values
final_data = all_data.dropna()
final_data.head()

Unnamed: 0,views,length,number,to,recip,make,a,for,how,the,...,descrip_follow,descrip_learn,descrip_dessert,descrip_parti,descrip_next,ratio,0,1,2,3
0,104887.0,750.0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,16.638436,0.20447,0.62151,0.173062,0.000957
1,128680.0,342.0,1,1,0,0,0,0,0,0,...,0,0,0,0,0,44.681818,0.000633,0.001032,0.997703,0.000632
2,832586.0,774.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,69.208211,0.01321,0.723245,0.000443,0.263102
3,105588.0,214.0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,60.561644,0.00088,0.000761,0.991751,0.006609
4,324847.0,383.0,1,1,0,0,0,0,0,0,...,0,0,0,0,0,49.188571,0.02603,0.693203,0.276989,0.003779


In [0]:
final_data['views'] = np.log(final_data['views'])
final_data['length'] = np.log(final_data['length'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [0]:
final_data.head()

Unnamed: 0,views,length,number,to,recip,make,a,for,how,the,...,descrip_follow,descrip_learn,descrip_dessert,descrip_parti,descrip_next,ratio,0,1,2,3
0,2.447606,6.620073,0,0,0,0,0,0,0,0,...,1,0,0,0,0,16.638436,0.20447,0.62151,0.173062,0.000957
1,2.465136,5.834811,1,1,0,0,0,0,0,0,...,0,0,0,0,0,44.681818,0.000633,0.001032,0.997703,0.000632
2,2.612441,6.651572,0,0,0,0,0,0,0,0,...,0,0,0,0,0,69.208211,0.01321,0.723245,0.000443,0.263102
3,2.448182,5.365976,0,0,0,0,0,0,0,0,...,1,0,0,0,0,60.561644,0.00088,0.000761,0.991751,0.006609
4,2.540902,5.948035,1,1,0,0,0,0,0,0,...,0,0,0,0,0,49.188571,0.02603,0.693203,0.276989,0.003779


In [0]:
X = final_data.drop('views', 1)
Y = final_data['views']

In [0]:
X_test,X_train,y_test,y_train=train_test_split(X,Y, test_size=0.9, random_state=31)

In [0]:
lasso = LassoCV(cv=5, random_state=0)
lasso.fit(X_train, y_train)

LassoCV(alphas=None, copy_X=True, cv=5, eps=0.001, fit_intercept=True,
        max_iter=1000, n_alphas=100, n_jobs=None, normalize=False,
        positive=False, precompute='auto', random_state=0, selection='cyclic',
        tol=0.0001, verbose=False)

In [0]:
Y_pred = lasso.predict(X_test)

In [0]:
train_score=lasso.score(X_train,y_train)
test_score=lasso.score(X_test,y_test)
print(train_score)
print(test_score)

0.6113550828740983
0.5882027977530069


In [0]:
coeff_used = np.sum(lasso.coef_!=0)
print(coeff_used)

57


In [0]:
lasso.coef_!=0

array([ True, False,  True,  True,  True, False, False, False, False,
        True, False,  True, False,  True, False,  True,  True,  True,
        True,  True, False,  True,  True,  True, False,  True, False,
       False,  True,  True, False, False, False, False,  True, False,
       False, False, False, False, False, False, False,  True, False,
       False,  True,  True, False,  True, False,  True,  True,  True,
       False,  True, False, False,  True, False, False,  True,  True,
       False,  True,  True, False,  True, False, False, False,  True,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False,  True, False, False,
       False,  True,  True, False,  True, False, False, False, False,
       False, False,  True,  True, False, False,  True, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False,  True,

In [0]:
lasso.coef_

array([ 6.21948972e-02,  0.00000000e+00, -7.03196129e-03, -7.38927017e-04,
        1.02131407e-02,  0.00000000e+00, -0.00000000e+00, -0.00000000e+00,
        0.00000000e+00, -3.85772319e-03, -0.00000000e+00, -8.03950033e-03,
        0.00000000e+00,  3.02015043e-02, -0.00000000e+00, -2.72715922e-02,
       -4.52291326e-03, -5.70046081e-03,  6.17205284e-03,  1.38676026e-03,
       -0.00000000e+00,  4.59626494e-03,  5.29089073e-02, -8.13652362e-03,
       -0.00000000e+00,  5.01801777e-03,  0.00000000e+00,  0.00000000e+00,
        3.80044674e-02,  3.44151570e-03, -0.00000000e+00, -0.00000000e+00,
        0.00000000e+00, -0.00000000e+00,  2.16504442e-03,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00, -0.00000000e+00,  0.00000000e+00, -4.02728389e-03,
        0.00000000e+00,  0.00000000e+00, -3.65332893e-02,  4.23422616e-02,
        0.00000000e+00, -1.61437147e-02, -0.00000000e+00,  3.05901585e-03,
        3.84240699e-02,  

In [0]:
lasso_list = list(final_data.columns)
lasso_list.pop(0)

'views'

In [0]:
# create a dictionary with feature and coeficient
coef_dict = dict(zip(lasso_list,list(list(lasso.coef_))))

In [0]:
sorted_importance = sorted(coef_dict.items(), key=operator.itemgetter(1))

In [0]:
sorted_importance

[('descrip_shop', -0.05343238240885633),
 ('lg', -0.039492323026538885),
 ('as', -0.03653328930893417),
 ('descrip_holiday', -0.030434505560318143),
 ('by', -0.02727159216092754),
 ('descrip_merch', -0.017308447555050878),
 ('pie', -0.016143714719320572),
 (1, -0.01229157732919619),
 ('descrip_get', -0.009737424293819439),
 ('!', -0.008136523616085568),
 ('and', -0.008039500325526439),
 ('to', -0.007031961289630765),
 ('descrip_perfect', -0.005835234152962451),
 ('with', -0.005700460812246873),
 ("'s", -0.0045229132643021044),
 ('way', -0.004027283886887344),
 ('your', -0.0038577231916164683),
 ('descrip_want', -0.0029707390794527075),
 (2, -0.0025454214930997186),
 ('salad', -0.0020056969595516234),
 ('descrip_recip', -0.001065593622734052),
 ('sweet', -0.0008604954384061963),
 ('recip', -0.0007389270173058755),
 ('descrip_network', -0.000449834068759545),
 ('usa', -8.395654252265675e-16),
 ('number', 0.0),
 ('a', 0.0),
 ('for', -0.0),
 ('how', -0.0),
 ('the', 0.0),
 (',', -0.0),
 ("'