# Models to predict ratings

In [109]:
import pandas as pd
import numpy as np

friends = pd.read_csv('transcripts and metadata/friends_mastersheet.csv', index_col=0)

In [110]:
friends.head()

Unnamed: 0,Year_of_prod,Season,Episode Number,Episode_Title,Duration,Summary,Director,Stars,Votes,match,...,Phoebe,Ross,Rachel,Carol,Susan,Janice,Mike,Gunther,Ben,Emily
0,1994,1,1,The One Where Monica Gets a Roommate: The Pilot,22,"Monica and the gang introduce Rachel to the ""r...",James Burrows,8.3,7440,11,...,0.061248,0.144114,0.155643,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1994,1,2,The One with the Sonogram at the End,22,Ross finds out his ex-wife is pregnant. Rachel...,James Burrows,8.1,4888,12,...,0.0615,0.248342,0.165269,0.088068,0.044696,0.0,0.0,0.0,0.0,0.0
2,1994,1,3,The One with the Thumb,22,Monica becomes irritated when everyone likes h...,James Burrows,8.2,4605,13,...,0.145415,0.125624,0.102383,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1994,1,4,The One with George Stephanopoulos,22,Joey and Chandler take Ross to a hockey game t...,James Burrows,8.1,4468,14,...,0.111161,0.151849,0.162042,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1994,1,5,The One with the East German Laundry Detergent,22,"Eager to spend time with Rachel, Ross pretends...",Pamela Fryman,8.5,4438,15,...,0.095007,0.150061,0.141764,0.0,0.0,0.047483,0.0,0.0,0.0,0.0


In [111]:
resultlist = [['model', 'LOOCV mean score', 'std']]
books = pd.read_csv('books/friends_books_topics_model.csv', index_col=0)
sb = pd.read_csv('scriptbase/friends_scriptbase_topics.csv', index_col=0)
friends_no_topics = pd.read_csv('transcripts and metadata/friends_mastersheet.csv', index_col=0)

## Baselines

### dummy baseline

In [112]:
from sklearn.feature_extraction.text import TfidfVectorizer

import numpy as np
from sklearn.dummy import DummyRegressor
from sklearn.model_selection import LeaveOneOut, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

cv = LeaveOneOut()

X = friends.text
y = friends.Stars

vect = TfidfVectorizer()
reg = DummyRegressor(strategy="mean")
pipe = make_pipeline(vect, reg)

scores = cross_val_score(pipe, X, y, cv=cv, scoring = 'neg_root_mean_squared_error')

resultlist.append(['dummy_baseline', np.mean(scores), np.std(scores)])

print(np.mean(scores), np.std(scores))

-0.3113667508113955 0.24655333378118335


### BOW baseline

In [113]:
from sklearn.linear_model import LinearRegression

cv = LeaveOneOut()

X = friends.text
y = friends.Stars

vect = TfidfVectorizer(max_features=5000)
reg = LinearRegression()
pipe = make_pipeline(vect, reg)

scores = cross_val_score(pipe, X, y, cv=cv, scoring = 'neg_root_mean_squared_error')

resultlist.append(['BOW_baseline', np.mean(scores), np.std(scores)])

print(np.mean(scores), np.std(scores))

-0.32209667011984083 0.27848673086034265


In [114]:
model = pipe.fit(X, y)

In [115]:
feature_names = model[:-1].get_feature_names_out()

coefs = pd.DataFrame(
    model[-1].coef_,
    columns=["Coefficients"],
    index=feature_names,
)

coefs["Coefficients"].sort_values(ascending=False)

plane       0.822873
bam         0.715183
la          0.617973
dessert     0.584831
backup      0.517409
              ...   
sperm      -0.448818
climb      -0.488773
everest    -0.495282
audition   -0.644059
vows       -1.060876
Name: Coefficients, Length: 5000, dtype: float64

### No topics

In [116]:
lst = ['Gary Halvorson',        
'Kevin Bright',          
'Michael Lembeck',       
'James Burrows',           
'Gail Mancuso',           
'Peter Bonerz',            
'David Schwimmer',         
'Ben Weiss']

def direr(wow):
    if wow not in lst:
        wow = wow.replace(wow,'Other')
    return wow

In [117]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer

X = friends_no_topics.drop(columns=['Summary', 'Episode_Title', 'Stars', 'Votes', 'text', 'match', 'len in words', 'len_prep'])
X.Director = X.Director.apply(direr)
y = friends_no_topics.Stars

reg = LinearRegression()
hot = OneHotEncoder()
scal = StandardScaler()

categorical_features = X.select_dtypes(include="object").columns
integer_features = X.select_dtypes(exclude="object").columns

prep = make_column_transformer((hot, categorical_features),
                                (scal, integer_features))

pipe = make_pipeline(prep,reg)

scores = cross_val_score(pipe, X, y, cv=cv, scoring = 'neg_root_mean_squared_error')

print(np.mean(scores), np.std(scores))
resultlist.append(['no_topic_baseline', np.mean(scores), np.std(scores)])

model = pipe.fit(X, y)

feature_names = model[:-1].get_feature_names_out()

X_preprocessed = pd.DataFrame(
    model[:-1].transform(X), columns=feature_names
)

coefs = pd.DataFrame(
    model[-1].coef_* X_preprocessed.std(axis=0),
    columns=["Coefficients"],
    index=feature_names,
)

coefs["Coefficients"].sort_values(ascending=False)

-0.31045740095235536 0.2520076439096313


standardscaler__Duration                   7.327942e-02
standardscaler__Ross                       6.348915e-02
standardscaler__Year_of_prod               5.674526e-02
standardscaler__Monica                     3.224115e-02
standardscaler__Janice                     2.630090e-02
standardscaler__Rachel                     2.171353e-02
standardscaler__Susan                      5.199015e-03
standardscaler__Emily                      4.954355e-03
standardscaler__Phoebe                     3.096629e-03
standardscaler__Joey                      -3.279206e-03
standardscaler__Ben                       -5.451320e-03
standardscaler__Chandler                  -1.023249e-02
standardscaler__Mike                      -1.480190e-02
standardscaler__Episode Number            -2.506571e-02
standardscaler__Gunther                   -2.997079e-02
standardscaler__Carol                     -4.312124e-02
standardscaler__Season                    -5.107228e-02
onehotencoder__Director_David Schwimmer   -4.247

### features + tfidf, no topics

In [118]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.feature_extraction.text import TfidfVectorizer

X = friends_no_topics.drop(columns=['Summary', 'Episode_Title', 'Stars', 'Votes', 'match', 'len in words', 'len_prep'])
X.Director = X.Director.apply(direr)
y = friends_no_topics.Stars

reg = LinearRegression()
hot = OneHotEncoder()
scal = StandardScaler()
vect = TfidfVectorizer(max_features=5000)

categorical_features = [X.select_dtypes(include="object").columns[0]]
integer_features= X.select_dtypes(exclude="object").columns
text_features = X.select_dtypes(include="object").columns[1]

prep = make_column_transformer((hot, categorical_features),
                                (scal, integer_features),
                                (vect, text_features),
                               remainder='passthrough')

pipe = make_pipeline(prep,reg)

scores = cross_val_score(pipe, X, y, cv=cv, scoring = 'neg_root_mean_squared_error')

print(np.mean(scores), np.std(scores))
resultlist.append(['no_topic_tf_idf_baseline', np.mean(scores), np.std(scores)])

# model = pipe.fit(X, y)

# feature_names = model[:-1].get_feature_names_out()

# X_preprocessed = pd.DataFrame(
#     model[:-1].transform(X), columns=feature_names
# )

# coefs = pd.DataFrame(
#     model[-1].coef_* X_preprocessed.std(axis=0),
#     columns=["Coefficients"],
#     index=feature_names,
# )

# coefs["Coefficients"].sort_values(ascending=False)

-0.3827187959279087 0.31705072714911003


In [119]:
X.columns

Index(['Year_of_prod', 'Season', 'Episode Number', 'Duration', 'Director',
       'text', 'Monica', 'Joey', 'Chandler', 'Phoebe', 'Ross', 'Rachel',
       'Carol', 'Susan', 'Janice', 'Mike', 'Gunther', 'Ben', 'Emily'],
      dtype='object')

## Models all features

### Books corpus

In [120]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer

X = books.drop(columns=['Summary', 'Episode_Title', 'Stars', 'Votes', 'text', 'match', 'len in words', 'len_prep'])
X.Director = X.Director.apply(direr)
y = books.Stars

reg = LinearRegression()
hot = OneHotEncoder()
scal = StandardScaler()

categorical_features = X.select_dtypes(include="object").columns
integer_features = X.select_dtypes(exclude="object").columns

prep = make_column_transformer((hot, categorical_features),
                                (scal, integer_features))

pipe = make_pipeline(prep,reg)

scores = cross_val_score(pipe, X, y, cv=cv, scoring = 'neg_root_mean_squared_error')

print(np.mean(scores), np.std(scores))

resultlist.append(['books_all_topics', np.mean(scores), np.std(scores)])

model = pipe.fit(X, y)

feature_names = model[:-1].get_feature_names_out()

X_preprocessed = pd.DataFrame(
    model[:-1].transform(X), columns=feature_names
)

coefs = pd.DataFrame(
    model[-1].coef_* X_preprocessed.std(axis=0),
    columns=["Coefficients"],
    index=feature_names,
)

coefs["Coefficients"].sort_values(ascending=False)

-0.9603450892037377 0.9452001763789


standardscaler__Duration                0.128057
standardscaler__t250                    0.127984
standardscaler__t113                    0.125014
onehotencoder__Director_Kevin Bright    0.109744
standardscaler__t49                     0.107743
                                          ...   
standardscaler__t15                    -0.113250
standardscaler__t8                     -0.121338
onehotencoder__Director_Other          -0.131412
standardscaler__t127                   -0.150214
standardscaler__Monica                 -0.196894
Name: Coefficients, Length: 326, dtype: float64

### scriptbase

In [121]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer

X = sb.drop(columns=['Summary', 'Episode_Title', 'Votes', 'text', 'match', 'Stars', 'len in words', 'len_prep'])
X.Director = X.Director.apply(direr)
y = sb.Stars

reg = LinearRegression()
hot = OneHotEncoder()
scal = StandardScaler()

categorical_features = X.select_dtypes(include="object").columns
integer_features = X.select_dtypes(exclude="object").columns

prep = make_column_transformer((hot, categorical_features),
                                (scal, integer_features))

pipe = make_pipeline(prep,reg)

scores = cross_val_score(pipe, X, y, cv=cv, scoring = 'neg_root_mean_squared_error')

print(np.mean(scores), np.std(scores))

resultlist.append(['sc_all_topics', np.mean(scores), np.std(scores)])

model = pipe.fit(X, y)

feature_names = model[:-1].get_feature_names_out()

X_preprocessed = pd.DataFrame(
    model[:-1].transform(X), columns=feature_names
)

coefs = pd.DataFrame(
    model[-1].coef_* X_preprocessed.std(axis=0),
    columns=["Coefficients"],
    index=feature_names,
)

coefs["Coefficients"].sort_values(ascending=False)

-0.6151407057359037 0.4791936825021369


standardscaler__t144                    0.124055
standardscaler__t163                    0.113698
onehotencoder__Director_Kevin Bright    0.112174
standardscaler__t122                    0.095668
standardscaler__Duration                0.095617
                                          ...   
standardscaler__t69                    -0.093749
standardscaler__t264                   -0.096200
onehotencoder__Director_Other          -0.123167
standardscaler__t185                   -0.124983
standardscaler__Monica                 -0.223561
Name: Coefficients, Length: 326, dtype: float64

## models 50

### books

In [122]:
books_best_50 = ['t83', 't188', 't274', 't230', 't273', 't204', 't196',
       't176', 't7', 't120', 't151', 't258', 't139', 't70', 't110', 't97',
       't235', 't282', 't104', 't62', 't245', 't80', 't171', 't115', 't270',
       't109', 't150', 't269', 't158', 't203', 't59', 't295', 't175', 't51',
       't166', 't113', 't285', 't299', 't286', 't88', 't127', 't69', 't222',
       't272', 't34', 't58', 't153', 't271', 't259', 't73']

to_drop = [f't{i}' for i in range(0,300) if f't{i}' not in books_best_50]

In [123]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer

columns = ['Summary', 'Episode_Title', 'Stars', 'Votes', 'text', 'match', 'len in words', 'len_prep'] + to_drop

X = books.drop(columns=columns)
X.Director = X.Director.apply(direr)
y = books.Stars

reg = LinearRegression()
hot = OneHotEncoder()
scal = StandardScaler()

categorical_features = X.select_dtypes(include="object").columns
integer_features = X.select_dtypes(exclude="object").columns

prep = make_column_transformer((hot, categorical_features),
                                (scal, integer_features))

pipe = make_pipeline(prep,reg)

scores = cross_val_score(pipe, X, y, cv=cv, scoring = 'neg_root_mean_squared_error')

print(np.mean(scores), np.std(scores))

resultlist.append(['books_50_topics', np.mean(scores), np.std(scores)])

model = pipe.fit(X, y)

feature_names = model[:-1].get_feature_names_out()

X_preprocessed = pd.DataFrame(
    model[:-1].transform(X), columns=feature_names
)

coefs = pd.DataFrame(
    model[-1].coef_* X_preprocessed.std(axis=0),
    columns=["Coefficients"],
    index=feature_names,
)

coefs["Coefficients"].sort_values(ascending=False)

-5091832.472855382 78056308.62585586


standardscaler__Duration                   0.080130
standardscaler__t34                        0.071589
standardscaler__t113                       0.065143
standardscaler__t73                        0.064742
onehotencoder__Director_Michael Lembeck    0.063624
                                             ...   
standardscaler__t62                       -0.044050
standardscaler__t110                      -0.046785
standardscaler__t158                      -0.050534
standardscaler__t151                      -0.056205
standardscaler__t59                       -0.060578
Name: Coefficients, Length: 76, dtype: float64

### scriptbase

In [124]:
sb_best_50 = ['t134', 't103', 't12', 't194', 't276', 't35', 't180',
       't222', 't32', 't191', 't299', 't64', 't202', 't288', 't275', 't260',
       't84', 't182', 't165', 't210', 't90', 't204', 't255', 't217', 't239',
       't205', 't26', 't86', 't7', 't119', 't296', 't67', 't265', 't152',
       't128', 't72', 't250', 't242', 't287', 't190', 't57', 't44', 't17',
       't132', 't189', 't240', 't193', 't21', 't138', 't130']

to_drop = [f't{i}' for i in range(0,300) if f't{i}' not in sb_best_50]

In [125]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer

columns = ['Summary', 'Episode_Title', 'Stars', 'Votes', 'text', 'match', 'len in words', 'len_prep'] + to_drop

X = sb.drop(columns=columns)
X.Director = X.Director.apply(direr)
y = books.Stars

reg = LinearRegression()
hot = OneHotEncoder()
scal = StandardScaler()

categorical_features = X.select_dtypes(include="object").columns
integer_features = X.select_dtypes(exclude="object").columns

prep = make_column_transformer((hot, categorical_features),
                                (scal, integer_features))

pipe = make_pipeline(prep,reg)

scores = cross_val_score(pipe, X, y, cv=cv, scoring = 'neg_root_mean_squared_error')

print(np.mean(scores), np.std(scores))

resultlist.append(['sb_50_topics', np.mean(scores), np.std(scores)])

model = pipe.fit(X, y)

feature_names = model[:-1].get_feature_names_out()

X_preprocessed = pd.DataFrame(
    model[:-1].transform(X), columns=feature_names
)

coefs = pd.DataFrame(
    model[-1].coef_* X_preprocessed.std(axis=0),
    columns=["Coefficients"],
    index=feature_names,
)

coefs["Coefficients"].sort_values(ascending=False)

-0.29842354294531614 0.24172291941277052


standardscaler__Year_of_prod             0.267696
standardscaler__Duration                 0.070331
standardscaler__t103                     0.051990
standardscaler__t130                     0.049365
onehotencoder__Director_James Burrows    0.047437
                                           ...   
standardscaler__t250                    -0.059349
standardscaler__t35                     -0.059957
standardscaler__t288                    -0.065644
standardscaler__t191                    -0.092622
standardscaler__Season                  -0.252367
Name: Coefficients, Length: 76, dtype: float64

## models 25

### books

In [126]:
books_best_25 = ['t83', 't188', 't274', 't230', 't273', 't204', 't196',
't176', 't7', 't120', 't151', 't258', 't139', 't70', 't110', 't97',
't235', 't282', 't104', 't62', 't245', 't80', 't171', 't115', 't270']

to_drop = [f't{i}' for i in range(0,300) if f't{i}' not in books_best_25]

In [127]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer

columns = ['Summary', 'Episode_Title', 'Stars', 'Votes', 'text', 'match', 'len in words', 'len_prep'] + to_drop

X = books.drop(columns=columns)
X.Director = X.Director.apply(direr)
y = books.Stars

reg = LinearRegression()
hot = OneHotEncoder()
scal = StandardScaler()

categorical_features = X.select_dtypes(include="object").columns
integer_features = X.select_dtypes(exclude="object").columns

prep = make_column_transformer((hot, categorical_features),
                                (scal, integer_features))

pipe = make_pipeline(prep,reg)

scores = cross_val_score(pipe, X, y, cv=cv, scoring = 'neg_root_mean_squared_error')

print(np.mean(scores), np.std(scores))

resultlist.append(['books_25_topics', np.mean(scores), np.std(scores)])

model = pipe.fit(X, y)

feature_names = model[:-1].get_feature_names_out()

X_preprocessed = pd.DataFrame(
    model[:-1].transform(X), columns=feature_names
)

coefs = pd.DataFrame(
    model[-1].coef_* X_preprocessed.std(axis=0),
    columns=["Coefficients"],
    index=feature_names,
)

coefs["Coefficients"].sort_values(ascending=False)

-0.3114153715736398 0.2630541759549018


standardscaler__Year_of_prod               0.089218
standardscaler__Duration                   0.074765
onehotencoder__Director_Kevin Bright       0.054542
standardscaler__t273                       0.048128
standardscaler__t235                       0.040072
onehotencoder__Director_Michael Lembeck    0.039455
standardscaler__Ross                       0.039346
standardscaler__Phoebe                     0.033669
standardscaler__Janice                     0.031512
onehotencoder__Director_James Burrows      0.022269
standardscaler__Rachel                     0.017430
standardscaler__t80                        0.012307
standardscaler__t83                        0.008797
standardscaler__Susan                      0.004825
onehotencoder__Director_David Schwimmer    0.002227
standardscaler__Monica                     0.001637
standardscaler__Ben                       -0.000678
standardscaler__Emily                     -0.002008
standardscaler__Mike                      -0.002063
standardscal

### scriptbase

In [128]:
sb_best_25 = ['t134', 't103', 't12', 't194', 't276', 't35', 't180',
       't222', 't32', 't191', 't299', 't64', 't202', 't288', 't275', 't260',
       't84', 't182', 't165', 't210', 't90', 't204', 't255', 't217', 't239']

to_drop = [f't{i}' for i in range(0,300) if f't{i}' not in sb_best_25]

In [129]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer

columns = ['Summary', 'Episode_Title', 'Stars', 'Votes', 'text', 'match', 'len in words', 'len_prep'] + to_drop

X = sb.drop(columns=columns)
X.Director = X.Director.apply(direr)
y = books.Stars

reg = LinearRegression()
hot = OneHotEncoder()
scal = StandardScaler()

categorical_features = X.select_dtypes(include="object").columns
integer_features = X.select_dtypes(exclude="object").columns

prep = make_column_transformer((hot, categorical_features),
                                (scal, integer_features))

pipe = make_pipeline(prep,reg)

scores = cross_val_score(pipe, X, y, cv=cv, scoring = 'neg_root_mean_squared_error')

print(np.mean(scores), np.std(scores))

resultlist.append(['sb_25_topics', np.mean(scores), np.std(scores)])

model = pipe.fit(X, y)

feature_names = model[:-1].get_feature_names_out()

X_preprocessed = pd.DataFrame(
    model[:-1].transform(X), columns=feature_names
)

coefs = pd.DataFrame(
    model[-1].coef_* X_preprocessed.std(axis=0),
    columns=["Coefficients"],
    index=feature_names,
)

coefs["Coefficients"].sort_values(ascending=False)

-0.2974797918025092 0.2315328201205982


standardscaler__Year_of_prod               0.175997
standardscaler__Duration                   0.093107
standardscaler__t12                        0.065964
standardscaler__t204                       0.060286
standardscaler__Phoebe                     0.041857
standardscaler__t255                       0.040320
standardscaler__t134                       0.040178
standardscaler__t182                       0.036322
standardscaler__t103                       0.036162
standardscaler__Janice                     0.034240
standardscaler__Ross                       0.034126
standardscaler__t260                       0.025455
standardscaler__Ben                        0.024957
onehotencoder__Director_James Burrows      0.023801
standardscaler__t165                       0.020922
onehotencoder__Director_Kevin Bright       0.020815
standardscaler__Monica                     0.020462
onehotencoder__Director_Michael Lembeck    0.018144
standardscaler__Rachel                     0.015454
standardscal

## Models 10

### books

In [130]:
sb_best_25 = ['t134', 't103', 't12', 't194', 't276', 't35', 't180',
       't222', 't32', 't191', 't299', 't64', 't202', 't288', 't275', 't260',
       't84', 't182', 't165', 't210', 't90', 't204', 't255', 't217', 't239']

to_drop = [f't{i}' for i in range(0,300) if f't{i}' not in sb_best_25[:10]]

In [131]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer

columns = ['Summary', 'Episode_Title', 'Stars', 'Votes', 'text', 'match', 'len in words', 'len_prep'] + to_drop

X = books.drop(columns=columns)
X.Director = X.Director.apply(direr)
y = books.Stars

reg = LinearRegression()
hot = OneHotEncoder()
scal = StandardScaler()

categorical_features = X.select_dtypes(include="object").columns
integer_features = X.select_dtypes(exclude="object").columns

prep = make_column_transformer((hot, categorical_features),
                                (scal, integer_features))

pipe = make_pipeline(prep,reg)

scores = cross_val_score(pipe, X, y, cv=cv, scoring = 'neg_root_mean_squared_error')

print(np.mean(scores), np.std(scores))

resultlist.append(['books_10_topics', np.mean(scores), np.std(scores)])

model = pipe.fit(X, y)

feature_names = model[:-1].get_feature_names_out()

X_preprocessed = pd.DataFrame(
    model[:-1].transform(X), columns=feature_names
)

coefs = pd.DataFrame(
    model[-1].coef_* X_preprocessed.std(axis=0),
    columns=["Coefficients"],
    index=feature_names,
)

coefs["Coefficients"].sort_values(ascending=False)

-0.3225936685541604 0.26939307682496816


standardscaler__Duration                   0.077861
onehotencoder__Director_Kevin Bright       0.074790
standardscaler__Ross                       0.067757
standardscaler__Monica                     0.035341
standardscaler__Rachel                     0.030315
standardscaler__t180                       0.029250
standardscaler__t134                       0.028703
onehotencoder__Director_Michael Lembeck    0.027042
standardscaler__Janice                     0.025601
standardscaler__t191                       0.021194
standardscaler__t12                        0.016842
standardscaler__Year_of_prod               0.016097
onehotencoder__Director_David Schwimmer    0.012326
onehotencoder__Director_James Burrows      0.011413
standardscaler__t276                       0.009943
standardscaler__Susan                      0.009228
standardscaler__Phoebe                     0.007119
standardscaler__Joey                       0.004133
standardscaler__t103                       0.002510
standardscal

### scriptbase

In [132]:
sb_best_25 = ['t134', 't103', 't12', 't194', 't276', 't35', 't180',
       't222', 't32', 't191', 't299', 't64', 't202', 't288', 't275', 't260',
       't84', 't182', 't165', 't210', 't90', 't204', 't255', 't217', 't239']

to_drop = [f't{i}' for i in range(0,300) if f't{i}' not in sb_best_25[:10]]

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer

columns = ['Summary', 'Episode_Title', 'Stars', 'Votes', 'text', 'match', 'len in words', 'len_prep'] + to_drop

X = sb.drop(columns=columns)
X.Director = X.Director.apply(direr)
y = books.Stars

reg = LinearRegression()
hot = OneHotEncoder()
scal = StandardScaler()

categorical_features = X.select_dtypes(include="object").columns
integer_features = X.select_dtypes(exclude="object").columns

prep = make_column_transformer((hot, categorical_features),
                                (scal, integer_features))

pipe = make_pipeline(prep,reg)

scores = cross_val_score(pipe, X, y, cv=cv, scoring = 'neg_root_mean_squared_error')

print(np.mean(scores), np.std(scores))

resultlist.append(['sb_10_topics', np.mean(scores), np.std(scores)])

model = pipe.fit(X, y)

feature_names = model[:-1].get_feature_names_out()

X_preprocessed = pd.DataFrame(
    model[:-1].transform(X), columns=feature_names
)

coefs = pd.DataFrame(
    model[-1].coef_* X_preprocessed.std(axis=0),
    columns=["Coefficients"],
    index=feature_names,
)

coefs["Coefficients"].sort_values(ascending=False)

-0.3038873936801926 0.23943307322039978


standardscaler__Duration                   0.083394
standardscaler__t12                        0.068854
standardscaler__Ross                       0.058842
standardscaler__t134                       0.054272
onehotencoder__Director_Kevin Bright       0.050282
standardscaler__Janice                     0.034728
standardscaler__t103                       0.031219
standardscaler__Year_of_prod               0.031183
onehotencoder__Director_Michael Lembeck    0.026734
onehotencoder__Director_James Burrows      0.023358
standardscaler__Rachel                     0.021272
standardscaler__Monica                     0.016815
standardscaler__Phoebe                     0.015975
onehotencoder__Director_Gary Halvorson     0.005538
standardscaler__Mike                       0.004083
onehotencoder__Director_David Schwimmer    0.002160
standardscaler__Ben                        0.002017
standardscaler__Emily                      0.000616
standardscaler__Gunther                   -0.003349
onehotencode

In [133]:
resultlist

[['model', 'LOOCV mean score', 'std'],
 ['dummy_baseline', -0.3113667508113955, 0.24655333378118335],
 ['BOW_baseline', -0.32209667011984083, 0.27848673086034265],
 ['no_topic_baseline', -0.31045740095235536, 0.2520076439096313],
 ['no_topic_tf_idf_baseline', -0.3827187959279087, 0.31705072714911003],
 ['books_all_topics', -0.9603450892037377, 0.9452001763789],
 ['sc_all_topics', -0.6151407057359037, 0.4791936825021369],
 ['books_50_topics', -5091832.472855382, 78056308.62585586],
 ['sb_50_topics', -0.29842354294531614, 0.24172291941277052],
 ['books_25_topics', -0.3114153715736398, 0.2630541759549018],
 ['sb_25_topics', -0.2974797918025092, 0.2315328201205982],
 ['books_10_topics', -0.3225936685541604, 0.26939307682496816],
 ['sb_10_topics', -0.3038873936801926, 0.23943307322039978]]