# Models to predict ratings

In [134]:
import pandas as pd
import numpy as np

friends = pd.read_csv('transcripts and metadata/friends_mastersheet.csv', index_col=0)

In [135]:
friends.head()

Unnamed: 0,Year_of_prod,Season,Episode Number,Episode_Title,Duration,Summary,Director,Stars,Votes,match,...,Phoebe,Ross,Rachel,Carol,Susan,Janice,Mike,Gunther,Ben,Emily
0,1994,1,1,The One Where Monica Gets a Roommate: The Pilot,22,"Monica and the gang introduce Rachel to the ""r...",James Burrows,8.3,7440,11,...,0.061248,0.144114,0.155643,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1994,1,2,The One with the Sonogram at the End,22,Ross finds out his ex-wife is pregnant. Rachel...,James Burrows,8.1,4888,12,...,0.0615,0.248342,0.165269,0.088068,0.044696,0.0,0.0,0.0,0.0,0.0
2,1994,1,3,The One with the Thumb,22,Monica becomes irritated when everyone likes h...,James Burrows,8.2,4605,13,...,0.145415,0.125624,0.102383,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1994,1,4,The One with George Stephanopoulos,22,Joey and Chandler take Ross to a hockey game t...,James Burrows,8.1,4468,14,...,0.111161,0.151849,0.162042,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1994,1,5,The One with the East German Laundry Detergent,22,"Eager to spend time with Rachel, Ross pretends...",Pamela Fryman,8.5,4438,15,...,0.095007,0.150061,0.141764,0.0,0.0,0.047483,0.0,0.0,0.0,0.0


In [136]:
resultlist = [['model', 'LOOCV mean score', 'std']]
books = pd.read_csv('books/friends_books_topics_model.csv', index_col=0)
sb = pd.read_csv('scriptbase/friends_scriptbase_topics.csv', index_col=0)
friends_no_topics = pd.read_csv('transcripts and metadata/friends_mastersheet.csv', index_col=0)

## Baselines

### dummy baseline

In [137]:
from sklearn.feature_extraction.text import TfidfVectorizer

import numpy as np
from sklearn.dummy import DummyRegressor
from sklearn.model_selection import LeaveOneOut, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

cv = LeaveOneOut()

X = friends.text
y = friends.Stars

vect = TfidfVectorizer()
reg = DummyRegressor(strategy="mean")
pipe = make_pipeline(vect, reg)

scores = cross_val_score(pipe, X, y, cv=cv, scoring = 'neg_root_mean_squared_error')

resultlist.append(['dummy_baseline', np.mean(scores), np.std(scores)])

print(np.mean(scores), np.std(scores))

-0.3113667508113955 0.24655333378118335


### BOW baseline

In [138]:
from sklearn.linear_model import LinearRegression

cv = LeaveOneOut()

X = friends.text
y = friends.Stars

vect = TfidfVectorizer(max_features=5000)
reg = LinearRegression()
pipe = make_pipeline(vect, reg)

scores = cross_val_score(pipe, X, y, cv=cv, scoring = 'neg_root_mean_squared_error')

resultlist.append(['BOW_baseline', np.mean(scores), np.std(scores)])

print(np.mean(scores), np.std(scores))

-0.32209667011984083 0.27848673086034265


In [139]:
model = pipe.fit(X, y)

In [140]:
feature_names = model[:-1].get_feature_names_out()

coefs = pd.DataFrame(
    model[-1].coef_,
    columns=["Coefficients"],
    index=feature_names,
)

coefs["Coefficients"].sort_values(ascending=False)

plane       0.822873
bam         0.715183
la          0.617973
dessert     0.584831
backup      0.517409
              ...   
sperm      -0.448818
climb      -0.488773
everest    -0.495282
audition   -0.644059
vows       -1.060876
Name: Coefficients, Length: 5000, dtype: float64

### No topics

In [141]:
lst = ['Gary Halvorson',        
'Kevin Bright',          
'Michael Lembeck',       
'James Burrows',           
'Gail Mancuso',           
'Peter Bonerz',            
'David Schwimmer',         
'Ben Weiss']

def direr(wow):
    if wow not in lst:
        wow = wow.replace(wow,'Other')
    return wow

In [142]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer

X = friends_no_topics.drop(columns=['Summary', 'Episode_Title', 'Duration', 'Stars', 'Votes', 'text', 'match', 'len in words', 'len_prep'])
X.Director = X.Director.apply(direr)
y = friends_no_topics.Stars

reg = LinearRegression()
hot = OneHotEncoder()
scal = StandardScaler()

categorical_features = X.select_dtypes(include="object").columns
integer_features = X.select_dtypes(exclude="object").columns

prep = make_column_transformer((hot, categorical_features),
                                (scal, integer_features))

pipe = make_pipeline(prep,reg)

scores = cross_val_score(pipe, X, y, cv=cv, scoring = 'neg_root_mean_squared_error')

print(np.mean(scores), np.std(scores))
resultlist.append(['no_topic_baseline', np.mean(scores), np.std(scores)])

model = pipe.fit(X, y)

feature_names = model[:-1].get_feature_names_out()

X_preprocessed = pd.DataFrame(
    model[:-1].transform(X), columns=feature_names
)

coefs = pd.DataFrame(
    model[-1].coef_* X_preprocessed.std(axis=0),
    columns=["Coefficients"],
    index=feature_names,
)

coefs["Coefficients"].sort_values(ascending=False)

-0.3094817257793724 0.2586888912565133


onehotencoder__Director_Kevin Bright       0.081545
standardscaler__Ross                       0.068645
standardscaler__Year_of_prod               0.042651
standardscaler__Monica                     0.037676
onehotencoder__Director_Michael Lembeck    0.036747
standardscaler__Janice                     0.025054
standardscaler__Rachel                     0.019724
onehotencoder__Director_James Burrows      0.013474
standardscaler__Emily                      0.010829
onehotencoder__Director_David Schwimmer    0.009618
standardscaler__Susan                      0.004725
standardscaler__Phoebe                     0.002840
standardscaler__Chandler                   0.001628
standardscaler__Joey                      -0.003325
standardscaler__Episode Number            -0.006939
onehotencoder__Director_Gary Halvorson    -0.009157
standardscaler__Ben                       -0.010835
standardscaler__Mike                      -0.015280
standardscaler__Season                    -0.019487
onehotencode

### features + tfidf, no topics

In [143]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.feature_extraction.text import TfidfVectorizer

X = friends_no_topics.drop(columns=['Summary', 'Episode_Title', 'Duration', 'Stars', 'Votes', 'match', 'len in words', 'len_prep'])
X.Director = X.Director.apply(direr)
y = friends_no_topics.Stars

reg = LinearRegression()
hot = OneHotEncoder()
scal = StandardScaler()
vect = TfidfVectorizer(max_features=5000)

categorical_features = [X.select_dtypes(include="object").columns[0]]
integer_features= X.select_dtypes(exclude="object").columns
text_features = X.select_dtypes(include="object").columns[1]

prep = make_column_transformer((hot, categorical_features),
                                (scal, integer_features),
                                (vect, text_features),
                               remainder='passthrough')

pipe = make_pipeline(prep,reg)

scores = cross_val_score(pipe, X, y, cv=cv, scoring = 'neg_root_mean_squared_error')

print(np.mean(scores), np.std(scores))
resultlist.append(['no_topic_tf_idf_baseline', np.mean(scores), np.std(scores)])

# model = pipe.fit(X, y)

# feature_names = model[:-1].get_feature_names_out()

# X_preprocessed = pd.DataFrame(
#     model[:-1].transform(X), columns=feature_names
# )

# coefs = pd.DataFrame(
#     model[-1].coef_* X_preprocessed.std(axis=0),
#     columns=["Coefficients"],
#     index=feature_names,
# )

# coefs["Coefficients"].sort_values(ascending=False)

-0.3919044810031503 0.32084187021713934


## Models all features

### Books corpus

In [144]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer

X = books.drop(columns=['Summary', 'Episode_Title', 'Stars', 'Duration', 'Votes', 'text', 'match', 'len in words', 'len_prep'])
X.Director = X.Director.apply(direr)
y = books.Stars

reg = LinearRegression()
hot = OneHotEncoder()
scal = StandardScaler()

categorical_features = X.select_dtypes(include="object").columns
integer_features = X.select_dtypes(exclude="object").columns

prep = make_column_transformer((hot, categorical_features),
                                (scal, integer_features))

pipe = make_pipeline(prep,reg)

scores = cross_val_score(pipe, X, y, cv=cv, scoring = 'neg_root_mean_squared_error')

print(np.mean(scores), np.std(scores))

resultlist.append(['books_all_topics', np.mean(scores), np.std(scores)])

model = pipe.fit(X, y)

feature_names = model[:-1].get_feature_names_out()

X_preprocessed = pd.DataFrame(
    model[:-1].transform(X), columns=feature_names
)

coefs = pd.DataFrame(
    model[-1].coef_* X_preprocessed.std(axis=0),
    columns=["Coefficients"],
    index=feature_names,
)

coefs["Coefficients"].sort_values(ascending=False)

-0.962550080393687 0.9319766604665668


standardscaler__t113                    0.150788
standardscaler__t250                    0.130598
standardscaler__t49                     0.122455
onehotencoder__Director_Kevin Bright    0.117847
standardscaler__t198                    0.109308
                                          ...   
standardscaler__t120                   -0.123063
onehotencoder__Director_Other          -0.125573
standardscaler__t8                     -0.145634
standardscaler__t127                   -0.163767
standardscaler__Monica                 -0.198191
Name: Coefficients, Length: 325, dtype: float64

### scriptbase

In [145]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer

X = sb.drop(columns=['Summary', 'Episode_Title', 'Votes', 'Duration', 'text', 'match', 'Stars', 'len in words', 'len_prep'])
X.Director = X.Director.apply(direr)
y = sb.Stars

reg = LinearRegression()
hot = OneHotEncoder()
scal = StandardScaler()

categorical_features = X.select_dtypes(include="object").columns
integer_features = X.select_dtypes(exclude="object").columns

prep = make_column_transformer((hot, categorical_features),
                                (scal, integer_features))

pipe = make_pipeline(prep,reg)

scores = cross_val_score(pipe, X, y, cv=cv, scoring = 'neg_root_mean_squared_error')

print(np.mean(scores), np.std(scores))

resultlist.append(['sc_all_topics', np.mean(scores), np.std(scores)])

model = pipe.fit(X, y)

feature_names = model[:-1].get_feature_names_out()

X_preprocessed = pd.DataFrame(
    model[:-1].transform(X), columns=feature_names
)

coefs = pd.DataFrame(
    model[-1].coef_* X_preprocessed.std(axis=0),
    columns=["Coefficients"],
    index=feature_names,
)

coefs["Coefficients"].sort_values(ascending=False)

-0.6443115921587788 0.527618122196445


standardscaler__t144                    0.123547
onehotencoder__Director_Kevin Bright    0.114594
standardscaler__t163                    0.107085
standardscaler__Chandler                0.104810
standardscaler__t259                    0.092218
                                          ...   
standardscaler__t69                    -0.109047
standardscaler__t264                   -0.109094
onehotencoder__Director_Other          -0.121730
standardscaler__t185                   -0.130466
standardscaler__Monica                 -0.219474
Name: Coefficients, Length: 325, dtype: float64

## models 50

### books

In [146]:
books_best_50 = ['t83', 't188', 't274', 't230', 't273', 't204', 't196',
       't176', 't7', 't120', 't151', 't258', 't139', 't70', 't110', 't97',
       't235', 't282', 't104', 't62', 't245', 't80', 't171', 't115', 't270',
       't109', 't150', 't269', 't158', 't203', 't59', 't295', 't175', 't51',
       't166', 't113', 't285', 't299', 't286', 't88', 't127', 't69', 't222',
       't272', 't34', 't58', 't153', 't271', 't259', 't73']

to_drop = [f't{i}' for i in range(0,300) if f't{i}' not in books_best_50]

In [147]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer

columns = ['Summary', 'Episode_Title', 'Stars', 'Votes', 'Duration', 'text', 'match', 'len in words', 'len_prep'] + to_drop

X = books.drop(columns=columns)
X.Director = X.Director.apply(direr)
y = books.Stars

reg = LinearRegression()
hot = OneHotEncoder()
scal = StandardScaler()

categorical_features = X.select_dtypes(include="object").columns
integer_features = X.select_dtypes(exclude="object").columns

prep = make_column_transformer((hot, categorical_features),
                                (scal, integer_features))

pipe = make_pipeline(prep,reg)

scores = cross_val_score(pipe, X, y, cv=cv, scoring = 'neg_root_mean_squared_error')

print(np.mean(scores), np.std(scores))

resultlist.append(['books_50_topics', np.mean(scores), np.std(scores)])

model = pipe.fit(X, y)

feature_names = model[:-1].get_feature_names_out()

X_preprocessed = pd.DataFrame(
    model[:-1].transform(X), columns=feature_names
)

coefs = pd.DataFrame(
    model[-1].coef_* X_preprocessed.std(axis=0),
    columns=["Coefficients"],
    index=feature_names,
)

coefs["Coefficients"].sort_values(ascending=False)

-2012182.9037310749 30846174.774059273


standardscaler__Season                     0.090092
standardscaler__t34                        0.072028
onehotencoder__Director_Kevin Bright       0.071688
onehotencoder__Director_Michael Lembeck    0.068637
standardscaler__t113                       0.063209
                                             ...   
onehotencoder__Director_Ben Weiss         -0.047018
standardscaler__t110                      -0.051585
standardscaler__Year_of_prod              -0.055502
standardscaler__t59                       -0.056312
standardscaler__t151                      -0.058897
Name: Coefficients, Length: 75, dtype: float64

### scriptbase

In [148]:
sb_best_50 = ['t134', 't103', 't12', 't194', 't276', 't35', 't180',
       't222', 't32', 't191', 't299', 't64', 't202', 't288', 't275', 't260',
       't84', 't182', 't165', 't210', 't90', 't204', 't255', 't217', 't239',
       't205', 't26', 't86', 't7', 't119', 't296', 't67', 't265', 't152',
       't128', 't72', 't250', 't242', 't287', 't190', 't57', 't44', 't17',
       't132', 't189', 't240', 't193', 't21', 't138', 't130']

to_drop = [f't{i}' for i in range(0,300) if f't{i}' not in sb_best_50]

In [149]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer

columns = ['Summary', 'Episode_Title', 'Stars', 'Votes', 'Duration', 'text', 'match', 'len in words', 'len_prep'] + to_drop

X = sb.drop(columns=columns)
X.Director = X.Director.apply(direr)
y = books.Stars

reg = LinearRegression()
hot = OneHotEncoder()
scal = StandardScaler()

categorical_features = X.select_dtypes(include="object").columns
integer_features = X.select_dtypes(exclude="object").columns

prep = make_column_transformer((hot, categorical_features),
                                (scal, integer_features))

pipe = make_pipeline(prep,reg)

scores = cross_val_score(pipe, X, y, cv=cv, scoring = 'neg_root_mean_squared_error')

print(np.mean(scores), np.std(scores))

resultlist.append(['sb_50_topics', np.mean(scores), np.std(scores)])

model = pipe.fit(X, y)

feature_names = model[:-1].get_feature_names_out()

X_preprocessed = pd.DataFrame(
    model[:-1].transform(X), columns=feature_names
)

coefs = pd.DataFrame(
    model[-1].coef_* X_preprocessed.std(axis=0),
    columns=["Coefficients"],
    index=feature_names,
)

coefs["Coefficients"].sort_values(ascending=False)

-0.29955877146325177 0.2469060386648278


standardscaler__Year_of_prod             0.262928
standardscaler__t26                      0.063666
onehotencoder__Director_James Burrows    0.049841
standardscaler__t103                     0.049039
standardscaler__t130                     0.048690
                                           ...   
standardscaler__t35                     -0.056943
standardscaler__t250                    -0.063082
standardscaler__t288                    -0.071445
standardscaler__t191                    -0.096714
standardscaler__Season                  -0.235347
Name: Coefficients, Length: 75, dtype: float64

## models 25

### books

In [150]:
books_best_25 = ['t83', 't188', 't274', 't230', 't273', 't204', 't196',
't176', 't7', 't120', 't151', 't258', 't139', 't70', 't110', 't97',
't235', 't282', 't104', 't62', 't245', 't80', 't171', 't115', 't270']

to_drop = [f't{i}' for i in range(0,300) if f't{i}' not in books_best_25]

In [151]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer

columns = ['Summary', 'Episode_Title', 'Stars', 'Votes', 'Duration', 'text', 'match', 'len in words', 'len_prep'] + to_drop

X = books.drop(columns=columns)
X.Director = X.Director.apply(direr)
y = books.Stars

reg = LinearRegression()
hot = OneHotEncoder()
scal = StandardScaler()

categorical_features = X.select_dtypes(include="object").columns
integer_features = X.select_dtypes(exclude="object").columns

prep = make_column_transformer((hot, categorical_features),
                                (scal, integer_features))

pipe = make_pipeline(prep,reg)

scores = cross_val_score(pipe, X, y, cv=cv, scoring = 'neg_root_mean_squared_error')

print(np.mean(scores), np.std(scores))

resultlist.append(['books_25_topics', np.mean(scores), np.std(scores)])

model = pipe.fit(X, y)

feature_names = model[:-1].get_feature_names_out()

X_preprocessed = pd.DataFrame(
    model[:-1].transform(X), columns=feature_names
)

coefs = pd.DataFrame(
    model[-1].coef_* X_preprocessed.std(axis=0),
    columns=["Coefficients"],
    index=feature_names,
)

coefs["Coefficients"].sort_values(ascending=False)

-0.31161903231758425 0.2681056341199774


standardscaler__Year_of_prod               0.071412
onehotencoder__Director_Kevin Bright       0.065728
standardscaler__Ross                       0.044533
standardscaler__t273                       0.043950
standardscaler__t235                       0.042728
onehotencoder__Director_Michael Lembeck    0.042657
standardscaler__Phoebe                     0.032983
standardscaler__Janice                     0.029180
onehotencoder__Director_James Burrows      0.025194
standardscaler__t80                        0.024074
standardscaler__Rachel                     0.018525
standardscaler__t104                       0.012934
standardscaler__t83                        0.011986
standardscaler__Susan                      0.007846
standardscaler__Monica                     0.007313
standardscaler__Emily                      0.007296
standardscaler__Chandler                   0.000362
onehotencoder__Director_David Schwimmer    0.000046
standardscaler__Mike                      -0.000866
standardscal

### scriptbase

In [152]:
sb_best_25 = ['t134', 't103', 't12', 't194', 't276', 't35', 't180',
       't222', 't32', 't191', 't299', 't64', 't202', 't288', 't275', 't260',
       't84', 't182', 't165', 't210', 't90', 't204', 't255', 't217', 't239']

to_drop = [f't{i}' for i in range(0,300) if f't{i}' not in sb_best_25]

In [153]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer

columns = ['Summary', 'Episode_Title', 'Stars', 'Votes', 'Duration', 'text', 'match', 'len in words', 'len_prep'] + to_drop

X = sb.drop(columns=columns)
X.Director = X.Director.apply(direr)
y = books.Stars

reg = LinearRegression()
hot = OneHotEncoder()
scal = StandardScaler()

categorical_features = X.select_dtypes(include="object").columns
integer_features = X.select_dtypes(exclude="object").columns

prep = make_column_transformer((hot, categorical_features),
                                (scal, integer_features))

pipe = make_pipeline(prep,reg)

scores = cross_val_score(pipe, X, y, cv=cv, scoring = 'neg_root_mean_squared_error')

print(np.mean(scores), np.std(scores))

resultlist.append(['sb_25_topics', np.mean(scores), np.std(scores)])

model = pipe.fit(X, y)

feature_names = model[:-1].get_feature_names_out()

X_preprocessed = pd.DataFrame(
    model[:-1].transform(X), columns=feature_names
)

coefs = pd.DataFrame(
    model[-1].coef_* X_preprocessed.std(axis=0),
    columns=["Coefficients"],
    index=feature_names,
)

coefs["Coefficients"].sort_values(ascending=False)

-0.3041366367497664 0.23962599005321444


standardscaler__Year_of_prod               0.169846
standardscaler__t204                       0.062844
standardscaler__t12                        0.058029
standardscaler__t134                       0.044298
standardscaler__Ross                       0.043994
onehotencoder__Director_Kevin Bright       0.038009
standardscaler__Phoebe                     0.037871
standardscaler__t255                       0.037047
standardscaler__Monica                     0.031911
standardscaler__t182                       0.031598
standardscaler__t103                       0.030346
onehotencoder__Director_James Burrows      0.029929
standardscaler__Janice                     0.029556
onehotencoder__Director_Michael Lembeck    0.027082
standardscaler__t165                       0.018409
standardscaler__Emily                      0.017134
standardscaler__Ben                        0.016602
standardscaler__t260                       0.014275
standardscaler__Rachel                     0.014046
standardscal

## Models 10

### books

In [154]:
sb_best_25 = ['t134', 't103', 't12', 't194', 't276', 't35', 't180',
       't222', 't32', 't191', 't299', 't64', 't202', 't288', 't275', 't260',
       't84', 't182', 't165', 't210', 't90', 't204', 't255', 't217', 't239']

to_drop = [f't{i}' for i in range(0,300) if f't{i}' not in sb_best_25[:10]]

In [155]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer

columns = ['Summary', 'Episode_Title', 'Stars', 'Votes', 'Duration', 'text', 'match', 'len in words', 'len_prep'] + to_drop

X = books.drop(columns=columns)
X.Director = X.Director.apply(direr)
y = books.Stars

reg = LinearRegression()
hot = OneHotEncoder()
scal = StandardScaler()

categorical_features = X.select_dtypes(include="object").columns
integer_features = X.select_dtypes(exclude="object").columns

prep = make_column_transformer((hot, categorical_features),
                                (scal, integer_features))

pipe = make_pipeline(prep,reg)

scores = cross_val_score(pipe, X, y, cv=cv, scoring = 'neg_root_mean_squared_error')

print(np.mean(scores), np.std(scores))

resultlist.append(['books_10_topics', np.mean(scores), np.std(scores)])

model = pipe.fit(X, y)

feature_names = model[:-1].get_feature_names_out()

X_preprocessed = pd.DataFrame(
    model[:-1].transform(X), columns=feature_names
)

coefs = pd.DataFrame(
    model[-1].coef_* X_preprocessed.std(axis=0),
    columns=["Coefficients"],
    index=feature_names,
)

coefs["Coefficients"].sort_values(ascending=False)

-0.3189384744100546 0.27674480431414583


onehotencoder__Director_Kevin Bright       0.084235
standardscaler__Ross                       0.074257
standardscaler__Monica                     0.042283
onehotencoder__Director_Michael Lembeck    0.035034
standardscaler__t134                       0.029695
standardscaler__t180                       0.029548
standardscaler__Rachel                     0.028852
standardscaler__Janice                     0.023606
standardscaler__t191                       0.018060
standardscaler__Season                     0.013810
onehotencoder__Director_James Burrows      0.013491
standardscaler__t12                        0.012272
standardscaler__Emily                      0.009124
standardscaler__Susan                      0.008964
onehotencoder__Director_David Schwimmer    0.008257
standardscaler__Phoebe                     0.007207
standardscaler__t276                       0.006784
standardscaler__Year_of_prod               0.006714
standardscaler__Joey                       0.002728
standardscal

### scriptbase

In [156]:
sb_best_25 = ['t134', 't103', 't12', 't194', 't276', 't35', 't180',
       't222', 't32', 't191', 't299', 't64', 't202', 't288', 't275', 't260',
       't84', 't182', 't165', 't210', 't90', 't204', 't255', 't217', 't239']

to_drop = [f't{i}' for i in range(0,300) if f't{i}' not in sb_best_25[:10]]

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer

columns = ['Summary', 'Episode_Title', 'Stars', 'Votes', 'Duration', 'text', 'match', 'len in words', 'len_prep'] + to_drop

X = sb.drop(columns=columns)
X.Director = X.Director.apply(direr)
y = books.Stars

reg = LinearRegression()
hot = OneHotEncoder()
scal = StandardScaler()

categorical_features = X.select_dtypes(include="object").columns
integer_features = X.select_dtypes(exclude="object").columns

prep = make_column_transformer((hot, categorical_features),
                                (scal, integer_features))

pipe = make_pipeline(prep,reg)

scores = cross_val_score(pipe, X, y, cv=cv, scoring = 'neg_root_mean_squared_error')

print(np.mean(scores), np.std(scores))

resultlist.append(['sb_10_topics', np.mean(scores), np.std(scores)])

model = pipe.fit(X, y)

feature_names = model[:-1].get_feature_names_out()

X_preprocessed = pd.DataFrame(
    model[:-1].transform(X), columns=feature_names
)

coefs = pd.DataFrame(
    model[-1].coef_* X_preprocessed.std(axis=0),
    columns=["Coefficients"],
    index=feature_names,
)

coefs["Coefficients"].sort_values(ascending=False)

-0.3071893944996791 0.24644930769839254


standardscaler__Ross                       0.064870
onehotencoder__Director_Kevin Bright       0.063553
standardscaler__t12                        0.061056
standardscaler__t134                       0.056028
onehotencoder__Director_Michael Lembeck    0.035281
standardscaler__Janice                     0.032148
onehotencoder__Director_James Burrows      0.026068
standardscaler__t103                       0.025968
standardscaler__Monica                     0.024495
standardscaler__Year_of_prod               0.021368
standardscaler__Rachel                     0.018521
standardscaler__Phoebe                     0.014562
standardscaler__Season                     0.010026
standardscaler__Emily                      0.007630
standardscaler__Mike                       0.003495
standardscaler__Chandler                   0.002006
standardscaler__Episode Number             0.001189
onehotencoder__Director_Gary Halvorson     0.000680
onehotencoder__Director_David Schwimmer   -0.001863
standardscal

In [157]:
resultlist

[['model', 'LOOCV mean score', 'std'],
 ['dummy_baseline', -0.3113667508113955, 0.24655333378118335],
 ['BOW_baseline', -0.32209667011984083, 0.27848673086034265],
 ['no_topic_baseline', -0.3094817257793724, 0.2586888912565133],
 ['no_topic_tf_idf_baseline', -0.3919044810031503, 0.32084187021713934],
 ['books_all_topics', -0.962550080393687, 0.9319766604665668],
 ['sc_all_topics', -0.6443115921587788, 0.527618122196445],
 ['books_50_topics', -2012182.9037310749, 30846174.774059273],
 ['sb_50_topics', -0.29955877146325177, 0.2469060386648278],
 ['books_25_topics', -0.31161903231758425, 0.2681056341199774],
 ['sb_25_topics', -0.3041366367497664, 0.23962599005321444],
 ['books_10_topics', -0.3189384744100546, 0.27674480431414583],
 ['sb_10_topics', -0.3071893944996791, 0.24644930769839254]]