## 1. Connect to DB

In [40]:
import pandas as pd

### Load the users table

# NOTE(review): `conn` is not created anywhere in the visible notebook; it is assumed
# to be an open database connection/engine defined beforehand — confirm.
user_info = pd.read_sql(sql="""SELECT * FROM public.user_data""", con=conn)

user_info.head()

Unnamed: 0,user_id,gender,age,country,city,exp_group,os,source
0,200,1,34,Russia,Degtyarsk,3,Android,ads
1,201,0,37,Russia,Abakan,0,Android,ads
2,202,1,17,Russia,Smolensk,4,Android,ads
3,203,0,18,Russia,Moscow,1,iOS,ads
4,204,0,36,Russia,Anzhero-Sudzhensk,3,Android,ads


In [3]:
### Load the posts table (post id, raw text and topic)

posts_info = pd.read_sql(sql="""SELECT * FROM public.post_text_df""", con=conn)

posts_info.head()

Unnamed: 0,post_id,text,topic
0,1,UK economy facing major risks\n\nThe UK manufa...,business
1,2,Aids and climate top Davos agenda\n\nClimate c...,business
2,3,Asian quake hits European shares\n\nShares in ...,business
3,4,India power shares jump on debut\n\nShares in ...,business
4,5,Lacroix label bought by US firm\n\nLuxury good...,business


In [None]:
### Attempting to load the entire feed_data table froze the kernel: the table is too big to fetch at once

# feed_data = pd.read_sql(
#     """SELECT * FROM public.feed_data""",
#     con=conn
# )

# feed_data.head()

In [4]:
### Count the rows of feed_data: with ~77 million rows it is too large to keep and process as a whole

count_feed_data = pd.read_sql(sql="""SELECT count(*) FROM public.feed_data""", con=conn)

count_feed_data.head()

Unnamed: 0,count
0,76892800


In [5]:
### Load a limited number of feed_data rows

# NOTE(review): LIMIT without ORDER BY returns whichever rows the database yields first,
# so this is presumably neither a random nor a repeatable sample — confirm that is acceptable.
feed_data = pd.read_sql(sql="""SELECT * FROM public.feed_data LIMIT 10000000""", con=conn)

feed_data.head()

Unnamed: 0,timestamp,user_id,post_id,action,target
0,2021-12-12 15:42:24,123029,3326,view,0
1,2021-12-12 15:42:56,123029,5634,view,1
2,2021-12-12 15:45:48,123029,5634,like,0
3,2021-12-12 15:45:50,123029,6352,view,0
4,2021-12-12 15:47:39,123029,4134,view,0


## 2. Feature processing

It seems that no obvious additional features can be derived from the users table.

So let's move on to the posts table. It contains a column with each post's text, which needs to be converted to numbers. One way to handle unique text values is to vectorize them with a TF-IDF transformation.

In [12]:
import re
import string

from nltk.stem import WordNetLemmatizer 
from sklearn.feature_extraction.text import TfidfVectorizer


wnl = WordNetLemmatizer()

# Character class matching any single ASCII punctuation mark. re.escape keeps the
# characters that are special inside [...] (such as \, ], ^ and -) literal, so the
# backslash is matched too instead of being consumed as an escape for ']'.
_PUNCTUATION_RE = re.compile("[{}]".format(re.escape(string.punctuation)))


def preprocessing(line, token=wnl):
    """Normalise a raw post text before TF-IDF tokenisation.

    The text is lower-cased, every ASCII punctuation character is replaced with a
    space, and each whitespace-separated word is lemmatised.

    Parameters
    ----------
    line : str
        Raw text of a single post.
    token : object, optional
        Lemmatiser exposing a ``lemmatize(word)`` method. Defaults to the shared
        ``WordNetLemmatizer`` instance created above.

    Returns
    -------
    str
        The lemmatised words joined by single spaces.
    """
    line = _PUNCTUATION_RE.sub(" ", line.lower())
    # split() without arguments splits on any run of whitespace (spaces, tabs,
    # newlines), so words next to a tab or newline are lemmatised as well and
    # no empty strings are passed to the lemmatiser.
    return ' '.join(token.lemmatize(word) for word in line.split())


# TF-IDF vectoriser that runs the custom `preprocessing` step on each text
# and drops English stop words.
tfidf = TfidfVectorizer(preprocessor=preprocessing, stop_words='english')

In [13]:
# Fit the vectoriser on every post text and convert the sparse result to a dense array
tfidf_data = tfidf.fit_transform(posts_info['text']).toarray()

tfidf_data

  % sorted(inconsistent)


array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.13271374, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.05060827, 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [14]:
# Label the TF-IDF matrix: one row per post_id, one column per vocabulary term
tfidf_data = pd.DataFrame(
    data=tfidf_data,
    columns=tfidf.get_feature_names_out(),
    index=posts_info['post_id'],
)

tfidf_data

Unnamed: 0_level_0,00,000,0001,000bn,000m,000s,000th,001,001and,001st,...,𝓫𝓮,𝓫𝓮𝓽𝓽𝓮𝓻,𝓬𝓸𝓾𝓻𝓽𝓼,𝓱𝓮𝓪𝓻𝓲𝓷𝓰,𝓶𝓪𝔂,𝓹𝓱𝔂𝓼𝓲𝓬𝓪𝓵,𝓼𝓸𝓸𝓷𝓮𝓻,𝓼𝓾𝓫𝓸𝓻𝓭𝓲𝓷𝓪𝓽𝓮,𝓽𝓱𝓮,𝓽𝓸
post_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.132714,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.050608,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7315,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7316,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7317,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7318,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
### Generate per-post summary features from the TF-IDF matrix

# Row-wise aggregates over all vocabulary terms, keyed by the target column name
row_aggregates = {
    'TotalTfIdf': tfidf_data.sum(axis=1),
    'MaxTfIdf': tfidf_data.max(axis=1),
    'MeanTfIdf': tfidf_data.mean(axis=1),
}

# The aggregates are indexed by post_id; resetting to a 0..n-1 index lets them
# line up with the rows of posts_info on assignment.
for feature_name, per_post in row_aggregates.items():
    posts_info[feature_name] = per_post.reset_index(drop=True)

posts_info.head()

Unnamed: 0,post_id,text,topic,TotalTfIdf,MaxTfIdf,MeanTfIdf
0,1,UK economy facing major risks\n\nThe UK manufa...,business,8.752692,0.495586,0.00019
1,2,Aids and climate top Davos agenda\n\nClimate c...,business,11.895813,0.307944,0.000259
2,3,Asian quake hits European shares\n\nShares in ...,business,12.689081,0.26177,0.000276
3,4,India power shares jump on debut\n\nShares in ...,business,6.622786,0.537713,0.000144
4,5,Lacroix label bought by US firm\n\nLuxury good...,business,6.352096,0.420251,0.000138


The TF-IDF data contains a huge number of features, so let's reduce its dimensionality.

In [17]:
### Reduce the TF-IDF dimensionality with PCA (input for the text clustering below)

from sklearn.decomposition import PCA

def get_PCA(n_components, data, random_state=0):
    """Project ``data`` onto its first ``n_components`` principal components.

    Parameters
    ----------
    n_components : int
        Number of principal components to keep.
    data : array-like of shape (n_samples, n_features)
        Feature matrix to decompose.
    random_state : int or None, optional
        Seed forwarded to ``PCA`` so that randomized solvers give the same
        projection on every run. Defaults to 0.

    Returns
    -------
    The transformed data as returned by ``PCA.fit_transform``
    (one row per sample, one column per component).
    """
    # PCA centres every feature itself before the decomposition, so an explicit
    # `data - data.mean()` is redundant and would only copy the whole matrix.
    pca = PCA(n_components=n_components, random_state=random_state)
    return pca.fit_transform(data)

### Use a large number of components to cover as much variance as possible
pca_decomp = get_PCA(50, tfidf_data)

# Preview a few random rows of the projection
pd.DataFrame(pca_decomp).sample(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,40,41,42,43,44,45,46,47,48,49
6209,-0.003317,0.01393,-0.005344,-0.010081,0.020722,0.040232,-0.021339,0.007126,-0.0254,-0.001981,...,-0.025661,-0.02361,-0.036382,0.040201,0.084185,0.043252,-0.026071,-0.013572,-0.007427,0.052455
3469,-0.134433,-0.10341,0.012716,0.016231,0.005541,-0.002917,0.012491,0.010666,0.010389,-0.000628,...,0.012833,-0.014671,0.013247,-0.012799,-0.000777,0.002359,0.01179,0.010023,0.002612,-0.016563
6061,0.215371,-0.040584,0.142051,-0.047714,0.00647,-0.016659,0.016994,0.06679,-0.023454,0.030067,...,0.005062,0.010367,0.010974,0.013254,0.025535,0.001683,-0.001329,0.013341,-0.024668,-0.037309
5503,0.046744,-0.009231,-0.031277,0.027425,-0.003336,-0.002959,-0.034403,-0.007601,-0.027939,-0.011343,...,0.032459,0.033043,-0.024726,-0.005797,-0.037784,0.018545,-0.0263,-0.025944,0.03375,-0.01789
2572,0.025941,0.084067,-0.008359,-0.081743,-0.077095,0.179832,-0.003476,-0.009584,0.006469,-0.03991,...,0.009041,-0.023364,0.041249,-0.004794,-0.024104,-0.016581,-0.039784,0.008469,-0.022321,0.000223


In [18]:
### Calculate distance to each cluster

from sklearn.cluster import KMeans

# Number of text clusters. It drives both the KMeans fit and the distance column
# names below, so the two cannot get out of sync.
N_TEXT_CLUSTERS = 15

kmeans = KMeans(n_clusters=N_TEXT_CLUSTERS, random_state=0).fit(pca_decomp)

# Cluster label assigned to every post
posts_info['TextCluster'] = kmeans.labels_

# One column per cluster centre, numbered from 1. The existing
# 'DistanceTo<k>thCluster' spelling is kept unchanged because these names become
# columns of the uploaded feature table.
dists_columns = [
    f'DistanceTo{cluster_no}thCluster'
    for cluster_no in range(1, N_TEXT_CLUSTERS + 1)
]

# kmeans.transform maps every post to its distances to all cluster centres
dists_df = pd.DataFrame(
    data=kmeans.transform(pca_decomp),
    columns=dists_columns
)

dists_df.head()

Unnamed: 0,DistanceTo1thCluster,DistanceTo2thCluster,DistanceTo3thCluster,DistanceTo4thCluster,DistanceTo5thCluster,DistanceTo6thCluster,DistanceTo7thCluster,DistanceTo8thCluster,DistanceTo9thCluster,DistanceTo10thCluster,DistanceTo11thCluster,DistanceTo12thCluster,DistanceTo13thCluster,DistanceTo14thCluster,DistanceTo15thCluster
0,0.499615,0.512809,0.560243,0.478849,0.481111,0.430154,0.480598,0.563876,0.553681,0.477726,0.532672,0.198731,0.549088,0.562525,0.571421
1,0.415719,0.423953,0.493104,0.382253,0.384773,0.297316,0.396566,0.368079,0.473238,0.383134,0.448981,0.39241,0.486882,0.494096,0.465219
2,0.429273,0.436614,0.51648,0.39592,0.404194,0.324836,0.416902,0.527347,0.490778,0.415785,0.456461,0.311719,0.492815,0.502836,0.492642
3,0.409398,0.429355,0.508439,0.388239,0.387784,0.312206,0.412191,0.507847,0.473334,0.40022,0.451209,0.40715,0.500897,0.488364,0.491987
4,0.248083,0.276922,0.387563,0.200721,0.20238,0.125618,0.250529,0.396546,0.34436,0.258596,0.307139,0.338297,0.371758,0.362885,0.381152


In [21]:
### Merge all tables

# Attach the cluster-distance features to the posts table. Distance columns left
# over from a previous run of this cell are dropped first, so re-running the cell
# does not append duplicate columns.
posts_info = pd.concat(
    (posts_info.drop(columns=dists_df.columns, errors='ignore'), dists_df),
    axis=1
)

# Keep only 'view' events. The outputs recorded in this notebook (no 'like' rows in
# the merged frame, fewer train+test rows than were loaded) show that this filter was
# applied, but the code for it was missing, so a fresh run would have trained on
# 'like' rows as well.
# NOTE(review): confirm that 'view' is the only action that should be kept.
feed_views = feed_data[feed_data['action'] == 'view']

# Left joins keep every remaining feed event even if a post or user is missing
df = (
    feed_views
    .merge(posts_info, on='post_id', how='left')
    .merge(user_info, on='user_id', how='left')
)

df.head()

Unnamed: 0,timestamp,user_id,post_id,action,target,text,topic,TotalTfIdf,MaxTfIdf,MeanTfIdf,...,DistanceTo13thCluster,DistanceTo14thCluster,DistanceTo15thCluster,gender,age,country,city,exp_group,os,source
0,2021-12-12 15:42:24,123029,3326,view,0,@kelliwardaz @realDonaldTrump As a physician i...,covid,3.044433,0.426839,6.6e-05,...,0.392399,0.321788,0.419958,1,42,Russia,Ufa,0,iOS,organic
1,2021-12-12 15:42:56,123029,5634,view,1,"First, a warning. How to Marry a Millionaire c...",movie,13.731914,0.242308,0.000299,...,0.413473,0.421532,0.415019,1,42,Russia,Ufa,0,iOS,organic
2,2021-12-12 15:45:50,123029,6352,view,0,I love this show. Its clever and very well act...,movie,6.152159,0.41518,0.000134,...,0.388414,0.400913,0.430881,1,42,Russia,Ufa,0,iOS,organic
3,2021-12-12 15:47:39,123029,4134,view,0,A Fairer and More Sustainable Post-COVID World...,covid,3.378108,0.385593,7.3e-05,...,0.419406,0.169599,0.447276,1,42,Russia,Ufa,0,iOS,organic
4,2021-12-12 15:50:10,123029,5989,view,0,The beauty of this film is evidenced in the gr...,movie,11.905695,0.381845,0.000259,...,0.469294,0.462355,0.487871,1,42,Russia,Ufa,0,iOS,organic


In [22]:
### Extract hour and month info from timestamp as these parameters can affect the target

# Parse the timestamps once and use the vectorised .dt accessor instead of a
# per-row Python lambda, which is very slow on millions of rows.
event_time = pd.to_datetime(df['timestamp'])
df['hour'] = event_time.dt.hour
df['month'] = event_time.dt.month

df.head()

Unnamed: 0,timestamp,user_id,post_id,action,target,text,topic,TotalTfIdf,MaxTfIdf,MeanTfIdf,...,DistanceTo15thCluster,gender,age,country,city,exp_group,os,source,hour,month
0,2021-12-12 15:42:24,123029,3326,view,0,@kelliwardaz @realDonaldTrump As a physician i...,covid,3.044433,0.426839,6.6e-05,...,0.419958,1,42,Russia,Ufa,0,iOS,organic,15,12
1,2021-12-12 15:42:56,123029,5634,view,1,"First, a warning. How to Marry a Millionaire c...",movie,13.731914,0.242308,0.000299,...,0.415019,1,42,Russia,Ufa,0,iOS,organic,15,12
2,2021-12-12 15:45:50,123029,6352,view,0,I love this show. Its clever and very well act...,movie,6.152159,0.41518,0.000134,...,0.430881,1,42,Russia,Ufa,0,iOS,organic,15,12
3,2021-12-12 15:47:39,123029,4134,view,0,A Fairer and More Sustainable Post-COVID World...,covid,3.378108,0.385593,7.3e-05,...,0.447276,1,42,Russia,Ufa,0,iOS,organic,15,12
4,2021-12-12 15:50:10,123029,5989,view,0,The beauty of this film is evidenced in the gr...,movie,11.905695,0.381845,0.000259,...,0.487871,1,42,Russia,Ufa,0,iOS,organic,15,12


In [23]:
### Remove columns that are not used as features and index the rows by (user_id, post_id)

df = (
    df
    .drop(columns=['action', 'text'])
    .set_index(['user_id', 'post_id'])
)

df.head(50)

Unnamed: 0_level_0,Unnamed: 1_level_0,timestamp,target,topic,TotalTfIdf,MaxTfIdf,MeanTfIdf,TextCluster,DistanceTo1thCluster,DistanceTo2thCluster,DistanceTo3thCluster,...,DistanceTo15thCluster,gender,age,country,city,exp_group,os,source,hour,month
user_id,post_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
123029,3326,2021-12-12 15:42:24,0,covid,3.044433,0.426839,6.6e-05,4,0.168681,0.289473,0.405159,...,0.419958,1,42,Russia,Ufa,0,iOS,organic,15,12
123029,5634,2021-12-12 15:42:56,1,movie,13.731914,0.242308,0.000299,3,0.325296,0.211746,0.373201,...,0.415019,1,42,Russia,Ufa,0,iOS,organic,15,12
123029,6352,2021-12-12 15:45:50,0,movie,6.152159,0.41518,0.000134,3,0.301262,0.28108,0.412474,...,0.430881,1,42,Russia,Ufa,0,iOS,organic,15,12
123029,4134,2021-12-12 15:47:39,0,covid,3.378108,0.385593,7.3e-05,13,0.246007,0.326066,0.435149,...,0.447276,1,42,Russia,Ufa,0,iOS,organic,15,12
123029,5989,2021-12-12 15:50:10,0,movie,11.905695,0.381845,0.000259,1,0.37994,0.271023,0.456245,...,0.487871,1,42,Russia,Ufa,0,iOS,organic,15,12
123029,5445,2021-12-12 15:53:07,1,movie,12.231171,0.233494,0.000266,3,0.3434,0.265134,0.400012,...,0.415026,1,42,Russia,Ufa,0,iOS,organic,15,12
123029,1984,2021-12-12 15:53:38,0,tech,9.082277,0.447536,0.000197,9,0.43299,0.438248,0.519645,...,0.487616,1,42,Russia,Ufa,0,iOS,organic,15,12
123029,4013,2021-12-12 15:54:11,0,covid,3.399737,0.355856,7.4e-05,4,0.212291,0.300773,0.417898,...,0.428639,1,42,Russia,Ufa,0,iOS,organic,15,12
123029,6037,2021-12-12 15:54:55,0,movie,10.012673,0.300833,0.000218,1,0.297491,0.165729,0.373983,...,0.40717,1,42,Russia,Ufa,0,iOS,organic,15,12
123029,795,2021-12-12 15:57:47,0,entertainment,8.576143,0.48315,0.000186,3,0.287821,0.281967,0.374261,...,0.405689,1,42,Russia,Ufa,0,iOS,organic,15,12


## 3. Train model 

In [24]:
### Check data time interval

# Series.max()/.min() are vectorised; the builtin max()/min() would iterate over
# every element of the column in Python.
df['timestamp'].max(), df['timestamp'].min()

(Timestamp('2021-12-29 23:44:39'), Timestamp('2021-10-01 06:01:40'))

In [25]:
### Time-based split: events before the split date go to train, the rest to test

SPLIT_DATE = '2021-12-15'

# The timestamp is only needed for the split itself, so it is dropped right after
df_train = df.loc[df['timestamp'] < SPLIT_DATE].drop(columns='timestamp')
df_test = df.loc[df['timestamp'] >= SPLIT_DATE].drop(columns='timestamp')

# Separate the features from the target
X_train, y_train = df_train.drop(columns='target'), df_train['target']
X_test, y_test = df_test.drop(columns='target'), df_test['target']

y_train.shape, y_test.shape

((7436944,), (1496269,))

In [30]:
### Use CatBoost algorithm for model learning

from catboost import CatBoostClassifier

# Columns passed to CatBoost as categorical features
object_cols = [
    'topic',
    'TextCluster',
    'gender',
    'country',
    'city',
    'exp_group',
    'hour',
    'month',
    'os',
    'source',
]

catboost = CatBoostClassifier(depth=3, iterations=200, learning_rate=1)

# The third positional argument of fit() is cat_features; it is passed by keyword here
catboost.fit(X_train, y_train, cat_features=object_cols)

0:	learn: 0.3539539	total: 5.47s	remaining: 18m 7s
1:	learn: 0.3481553	total: 9.16s	remaining: 15m 6s
2:	learn: 0.3468142	total: 12.9s	remaining: 14m 10s
3:	learn: 0.3457898	total: 17.8s	remaining: 14m 30s
4:	learn: 0.3454832	total: 22s	remaining: 14m 18s
5:	learn: 0.3451309	total: 25.7s	remaining: 13m 52s
6:	learn: 0.3448441	total: 29.7s	remaining: 13m 39s
7:	learn: 0.3445550	total: 33.2s	remaining: 13m 15s
8:	learn: 0.3417518	total: 37.5s	remaining: 13m 16s
9:	learn: 0.3409253	total: 41.2s	remaining: 13m 3s
10:	learn: 0.3408341	total: 45.4s	remaining: 13m
11:	learn: 0.3407293	total: 49s	remaining: 12m 48s
12:	learn: 0.3406729	total: 53.3s	remaining: 12m 46s
13:	learn: 0.3404560	total: 57.3s	remaining: 12m 40s
14:	learn: 0.3398563	total: 1m 1s	remaining: 12m 34s
15:	learn: 0.3397799	total: 1m 5s	remaining: 12m 32s
16:	learn: 0.3396732	total: 1m 9s	remaining: 12m 32s
17:	learn: 0.3395861	total: 1m 14s	remaining: 12m 30s
18:	learn: 0.3394588	total: 1m 18s	remaining: 12m 24s
19:	learn: 0

<catboost.core.CatBoostClassifier at 0x7f7b426712d0>

In [31]:
### Check ROC-AUC score

from sklearn.metrics import roc_auc_score


# Predicted probability of the positive class (column 1) for each split
train_proba = catboost.predict_proba(X_train)[:, 1]
test_proba = catboost.predict_proba(X_test)[:, 1]

print(f"Train ROC-AUC score: {roc_auc_score(y_train, train_proba)}")
print(f"Test ROC-AUC score: {roc_auc_score(y_test, test_proba)}")

Качество на трейне: 0.6939094831420071
Качество на тесте: 0.6635775910878621


## 4. Save model and Upload processed features 

In [38]:
### Persist the trained model to disk in CatBoost's binary "cbm" format

catboost.save_model('catboost_model_2', format="cbm")

In [34]:
### Upload the processed posts features only (features of the other tables can be computed while the service runs)

# Replace the table if it already exists; the DataFrame index is not written
posts_info.to_sql(name="n-ignatov-12.2", con=conn, index=False, if_exists='replace')
                                   

In [36]:
### Check that the uploaded table can be read back

test = pd.read_sql(
    """SELECT * FROM "n-ignatov-12.2" """,
    con=conn
)

# Display the frame that was just read back
test

Unnamed: 0,post_id,text,topic,TotalTfIdf,MaxTfIdf,MeanTfIdf,TextCluster,DistanceTo1thCluster,DistanceTo2thCluster,DistanceTo3thCluster,...,DistanceTo6thCluster,DistanceTo7thCluster,DistanceTo8thCluster,DistanceTo9thCluster,DistanceTo10thCluster,DistanceTo11thCluster,DistanceTo12thCluster,DistanceTo13thCluster,DistanceTo14thCluster,DistanceTo15thCluster
0,1,UK economy facing major risks\n\nThe UK manufa...,business,8.752692,0.495586,0.000190,11,0.499615,0.512809,0.560243,...,0.430154,0.480598,0.563876,0.553681,0.477726,0.532672,0.198731,0.549088,0.562525,0.571421
1,2,Aids and climate top Davos agenda\n\nClimate c...,business,11.895813,0.307944,0.000259,5,0.415719,0.423953,0.493104,...,0.297316,0.396566,0.368079,0.473238,0.383134,0.448981,0.392410,0.486882,0.494096,0.465219
2,3,Asian quake hits European shares\n\nShares in ...,business,12.689081,0.261770,0.000276,11,0.429273,0.436614,0.516480,...,0.324836,0.416902,0.527347,0.490778,0.415785,0.456461,0.311719,0.492815,0.502836,0.492642
3,4,India power shares jump on debut\n\nShares in ...,business,6.622786,0.537713,0.000144,5,0.409398,0.429355,0.508439,...,0.312206,0.412191,0.507847,0.473334,0.400220,0.451209,0.407150,0.500897,0.488364,0.491987
4,5,Lacroix label bought by US firm\n\nLuxury good...,business,6.352096,0.420251,0.000138,5,0.248083,0.276922,0.387563,...,0.125618,0.250529,0.396546,0.344360,0.258596,0.307139,0.338297,0.371758,0.362885,0.381152
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7018,7314,Although the likeliness of someone focusing on...,movie,10.321070,0.300440,0.000224,10,0.528246,0.448095,0.585605,...,0.496074,0.507980,0.601241,0.583400,0.533076,0.309049,0.583666,0.580253,0.593029,0.583416
7019,7315,"OK, I would not normally watch a Farrelly brot...",movie,6.703302,0.258412,0.000146,10,0.426087,0.370312,0.514723,...,0.400827,0.410245,0.530967,0.490018,0.446231,0.235563,0.502867,0.496346,0.500140,0.510559
7020,7317,I cant believe this film was allowed to be mad...,movie,5.628524,0.573061,0.000122,1,0.307197,0.140023,0.400942,...,0.286083,0.319233,0.465707,0.393260,0.352690,0.298911,0.424257,0.424774,0.407824,0.432028
7021,7318,The version I saw of this film was the Blockbu...,movie,6.950472,0.229953,0.000151,1,0.365604,0.226645,0.386934,...,0.319316,0.332349,0.462482,0.434414,0.380812,0.317524,0.443794,0.436076,0.453264,0.452098
