In [1]:
import re
from collections import Counter
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from matplotlib.textpath import TextPath
from scipy.cluster.hierarchy import dendrogram
from sklearn import mixture
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering, KMeans
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from stop_words import get_stop_words

In [2]:
#vereinfachte Methode:
def linkage_matrix(n_samples, children, distances):
    """
    Build a linkage matrix for ``scipy.cluster.hierarchy.dendrogram``.

    n_samples: int, number of original samples (leaves)
    children: array-like of shape (n_merges, 2); each row holds the two node
        indices merged at that step. An index below ``n_samples`` is a leaf,
        index ``n_samples + i`` refers to the node created by merge ``i``.
    distances: array-like of length n_merges, distance of each merge

    Returns a float array of shape (n_merges, 4) with the columns
    [child_a, child_b, distance, number of samples under the new node].
    """
    # Accept plain lists as well as ndarrays; the loop below needs ``.shape``.
    children = np.asarray(children)

    # Count the samples below each merge node.
    counts = np.zeros(children.shape[0])
    for i, merge in enumerate(children):
        current_count = 0
        for child_idx in merge:
            if child_idx < n_samples:
                current_count += 1  # leaf node
            else:
                # inner node: reuse the count stored for that earlier merge
                current_count += counts[child_idx - n_samples]
        counts[i] = current_count

    return np.column_stack([children, distances, counts]).astype(float)

In [3]:
df = pd.read_csv('../dataset/movies_complete.csv')
# dfs is a row-shuffled copy of the full DataFrame; it is kept separately
# because df itself is filtered further down.
# A fixed random_state makes the shuffle identical on every run.
dfs = df.sample(frac=1, random_state=42)
dfs.shape

(3728, 56)

In [4]:
# Keep a single row per IMDB_ID in both frames, then report the size of dfs.
dfs = dfs.drop_duplicates(subset=['IMDB_ID'])
df = df.drop_duplicates(subset=['IMDB_ID'])
dfs.shape

(3728, 56)

In [5]:
dfs.head()

Unnamed: 0.1,Unnamed: 0,filename,text,IMDB_ID,genre,year,production_region,corpus,duration,directors,...,Ratings,Metascore,imdbRating,imdbVotes,Type,DVD,BoxOffice,Production,Website,Response
1396,1396,4494465.xml,"IM DUNKELN sind alle Wölfe grau Pfui Deibel , ...",tt2102465,"Action,Crime,Thriller",2011.0,Norwegian,untokenisiert,87.0,nm1103121,...,"[{'Source': 'Internet Movie Database', 'Value'...",,6.6,1146,movie,,,,,True
3210,3210,6696047.xml,"Es war das Jahr , an das sich alle erinnerten ...",tt3838728,Drama,2015.0,French,untokenisiert,98.0,nm0404067,...,"[{'Source': 'Internet Movie Database', 'Value'...",53.0,5.5,2416,movie,,,Full House Films,,True
1888,1888,4562212.xml,"2012 Nach 2 Jahren der "" Rettung "" haben die ""...",tt2385027,Documentary,2012.0,Greek,untokenisiert,87.0,"nm4414980,nm4415168",...,"[{'Source': 'Internet Movie Database', 'Value'...",,8.1,435,movie,,,,,True
3532,3532,6729676.xml,"Ok . Ah , das ist es . - Ok . - Ok . - Na also...",tt4438848,Comedy,2016.0,,untokenisiert,92.0,nm0831557,...,"[{'Source': 'Internet Movie Database', 'Value'...",58.0,5.7,109853,movie,20 Sep 2016,,Universal Pictures,,True
3498,3498,6717187.xml,"- Scheiße , ich werde dich vermissen . - Ja . ...",tt4074364,Drama,2016.0,,untokenisiert,105.0,nm1323584,...,"[{'Source': 'Internet Movie Database', 'Value'...",58.0,5.8,3418,movie,,,Film Väst,,True


# Clustering über Zeit mit Embeddings

In [6]:
from pathlib import Path
import numpy as np
def read_embeddings(embedding_file: str):
    """
    Read embeddings from a whitespace-separated text file.

    Each line is expected to hold a file name followed by the vector
    components. Lines with fewer than two fields (e.g. blank lines) are
    skipped.

    embedding_file: path of the text file to read

    Returns a tuple ``(filenames, embeddings)`` of NumPy arrays with one
    entry per parsed line, in file order.
    """
    filenames = []
    embeddings = []
    for line in Path(embedding_file).read_text().split('\n'):
        # split() without an argument collapses runs of whitespace and ignores
        # trailing blanks; split(' ') produced empty fields in those cases,
        # which made float('') raise ValueError.
        fields = line.split()
        if len(fields) >= 2:
            filenames.append(fields[0])
            embeddings.append(list(map(float, fields[1:])))
    return np.asarray(filenames), np.asarray(embeddings)

In [7]:
filenames, embeddings = read_embeddings('../dataset/embeddings.txt')

FileNotFoundError: [Errno 2] No such file or directory: '..\\dataset\\embeddings.txt'

In [None]:
filenames.shape, embeddings.shape

In [None]:
df = df[df['filename'].isin(filenames)].copy()
df.shape

In [None]:
(df.filename == filenames).all()

In [None]:
# Store one embedding vector per row; the assignment is positional, so the
# rows of df must be in the same order as `embeddings`.
df['embedding'] = list(embeddings)
df.shape

## Plot über Zeit

In [None]:
# Films per release year (Series.value_counts replaces the deprecated
# top-level pd.value_counts), plus mean and median of the years.
df['year'].value_counts().plot.bar()
print('Mean: ', df['year'].mean(), 'Median: ', df['year'].median())

## Split der Daten in Zeitperioden

In [None]:
def _period_color(year):
    """Return the plot colour for the time period a release year falls into."""
    if year >= 2017:
        return 'red'
    if year <= 2007:
        return 'blue'
    # neither period applies (also the case for NaN, whose comparisons are False)
    return 'white'


colorlist = [_period_color(x) for x in df.year]

In [None]:
df['labelcolor']=colorlist

## K-Means

In [None]:
# random_state pins the centroid initialisation, so the cluster labels are
# the same on every run.
model = KMeans(n_clusters=2, random_state=42)
model.fit(embeddings)

In [None]:
len(model.labels_)

## PCA

In [None]:
# Project the embeddings onto their first two principal components.
pca = PCA(n_components=2)
X_tf = pca.fit(embeddings).transform(embeddings)

# Store the intermediate results in the DataFrame
df['pca2d_1'], df['pca2d_2'] = X_tf[:, 0], X_tf[:, 1]
df['kmeans'] = model.labels_

# Plot from the stored results: the dot colour is the time period, the digit
# drawn on top of it is the k-means label.
plt.figure(figsize=(15,8))
for _, row in df.iterrows():
    point = (row.pca2d_1, row.pca2d_2)
    plt.plot(*point, 'o', markersize=4, color=row.labelcolor)
    plt.plot(*point, marker=TextPath((-3, -3), f'{row.kmeans}'), color='black', markersize=1)


In [None]:
# Same 2-D PCA projection and k-means labels, recomputed in this cell.
pca = PCA(n_components=2)
X_tf = pca.fit(embeddings).transform(embeddings)
print(len(X_tf))
df['pca2d_1'], df['pca2d_2'] = X_tf[:, 0], X_tf[:, 1]
df['kmeans'] = model.labels_

# Only films that fall into one of the two periods are plotted.
df_clean = df[df.labelcolor != 'white']

plt.figure(figsize=(15,8))
for _, row in df_clean.iterrows():
    point = (row.pca2d_1, row.pca2d_2)
    plt.plot(*point, 'o', markersize=8, color=row.labelcolor)
    plt.plot(*point, marker=TextPath((-3, -3), f'{row.kmeans}'), color='white', markersize=6)


### Verteilung der Ergebnisse

In [None]:
# Count the films per (time-period colour, k-means cluster) combination.
is_blue, is_red = df['labelcolor'] == 'blue', df['labelcolor'] == 'red'
in_cluster0, in_cluster1 = df['kmeans'] == 0, df['kmeans'] == 1
b1 = df.loc[in_cluster1 & is_blue]
b0 = df.loc[in_cluster0 & is_blue]
r1 = df.loc[in_cluster1 & is_red]
r0 = df.loc[in_cluster0 & is_red]
print('b1: ', len(b1), 'b0: ', len(b0), 'r1: ', len(r1), 'r0: ', len(r0))

# Ohne Embeddings

# Zwei Cluster

In [None]:
# Keep only rows in which all of the listed fields are present.
dfs = dfs.dropna(subset=['text', 'year', 'genre', 'duration', 'runtimeMinutes'])
plt.figure(figsize=(15,8))
# Series.value_counts replaces the deprecated top-level pd.value_counts
dfs['year'].value_counts().plot.bar()
dfs.shape

# Einschub Titletype - Was ist in Filme enthalten?

## Movie VS. TVMovie

In [None]:
# remove "ein Netflix Original" 

In [None]:
# Frequency of each titleType, computed once and shown as chart and table.
title_type_counts = dfs['titleType'].value_counts()
title_type_counts.plot.bar()
title_type_counts

## Movie

In [None]:
# Rows whose titleType is 'movie'; .copy() makes dfs_movie an independent
# frame so that adding columns to it later does not write into a slice of dfs.
dfs_movie = dfs.loc[dfs['titleType'] == 'movie'].copy()
plt.figure(figsize=(15,8))
dfs_movie['genre'].value_counts().plot.bar()

In [None]:
# The 20 most frequent genre combinations among the movies
# (Counter is imported in the notebook's import cell).
genre_combination_frequs = Counter(dfs_movie.genre)
genre_combination_frequs.most_common(20)

In [None]:
# Series.value_counts replaces the deprecated top-level pd.value_counts
dfs_movie['genre'].value_counts()

In [None]:
# Movies per release year
plt.figure(figsize=(15,8))
dfs_movie['year'].value_counts().plot.bar()

## TV Movie

In [None]:
# Rows whose titleType is 'tvMovie', and their genre distribution
dfs_tvmovie = dfs.loc[dfs['titleType'] == 'tvMovie']
plt.figure(figsize=(15,8))
dfs_tvmovie['genre'].value_counts().plot.bar()

In [None]:
# The 20 most frequent genre combinations among the TV movies
# (Counter is imported in the notebook's import cell).
genre_combination_frequs = Counter(dfs_tvmovie.genre)
genre_combination_frequs.most_common(20)

In [None]:
# TV episodes with a runtime of at least 50 minutes
is_episode = dfs['titleType'] == 'tvEpisode'
is_long = dfs['runtimeMinutes'].astype('int') >= 50
dfs_series_long = dfs.loc[is_episode & is_long]
plt.figure(figsize=(15,8))
# Series.value_counts replaces the deprecated top-level pd.value_counts
dfs_series_long['genre'].value_counts().plot.bar()
dfs_series_long.shape

In [None]:
# Long TV episodes per release year
dfs_series_long['year'].value_counts().plot.bar()

## Slicing 2 Cluster Datensatz

In [None]:
# Two periods: 1940-1970 (blue) and 2016 (red); every other year is 'white'
# and gets filtered out below.
year = dfs['year']
colors = np.select(
    [year == 2016, (year <= 1970) & (year >= 1940)],
    ['red', 'blue'],
    default='white',
)
dfs['colors'] = colors

# .copy() makes dfs_clean an independent frame, so columns added to it later
# do not trigger pandas' SettingWithCopyWarning.
dfs_clean = dfs[dfs.colors != 'white'].copy()
color_counts = dfs_clean['colors'].value_counts()
color_counts.plot.bar()
color_counts

# K-Means mit TF-IDF

In [None]:
# TF-IDF
tfidf = TfidfVectorizer(max_features=10000)
tokens = tfidf.fit_transform(dfs_clean.text)
#vectorizer = CountVectorizer()
#X = vectorizer.fit_transform(df_split.text)

In [None]:
tokens.toarray(), tokens.shape

In [None]:
# k-means on the TF-IDF matrix; random_state makes the labels reproducible.
models = KMeans(n_clusters=2, random_state=42)
models.fit(tokens)

In [None]:
len(models.labels_)

In [None]:
tokens

In [None]:
# PCA needs a dense array. .toarray() returns an ndarray, whereas .todense()
# returns np.matrix, which current scikit-learn versions reject.
dense_tokens = tokens.toarray()
pca = PCA(n_components=2)
X_tf = pca.fit(dense_tokens).transform(dense_tokens)

# Store the intermediate results; assign() returns a new frame instead of
# writing into what may be a slice of dfs.
dfs_clean = dfs_clean.assign(
    pcatokens_1=X_tf[:, 0],
    pcatokens_2=X_tf[:, 1],
    kmeanstokens=models.labels_,
)

# Plot from the stored results: dot colour = time period, digit = k-means label
plt.figure(figsize=(15,8))
for _, row in dfs_clean.iterrows():
    x, y = row.pcatokens_1, row.pcatokens_2
    plt.plot(x, y, 'o', markersize=4, color=row.colors)
    plt.plot(x, y, marker=TextPath((-3, -3), f'{row.kmeanstokens}'), color='black', markersize=1)

In [None]:
# Work on a 10 % random sample of dfs_clean (seeded, so the same rows are
# drawn on every run). TF-IDF and PCA are refitted on the sample; the
# 'kmeanstokens' column read below must already be present in dfs_clean.
dfs_cleans = dfs_clean.sample(frac=0.1, random_state=42)

tfidf = TfidfVectorizer(max_features=10000)
tokens2 = tfidf.fit_transform(dfs_cleans.text)

# .toarray() returns an ndarray; scikit-learn rejects the np.matrix that
# .todense() returns.
dense_tokens2 = tokens2.toarray()
pca = PCA(n_components=2)
X_tf = pca.fit(dense_tokens2).transform(dense_tokens2)

# Store the intermediate results in the sampled frame
dfs_cleans = dfs_cleans.assign(pcatokens_1=X_tf[:, 0], pcatokens_2=X_tf[:, 1])

# Plot from the stored results
plt.figure(figsize=(15,8))
for _, row in dfs_cleans.iterrows():
    x, y = row.pcatokens_1, row.pcatokens_2
    plt.plot(x, y, 'o', markersize=8, color=row.colors)
    plt.plot(x, y, marker=TextPath((-3, -3), f'{row.kmeanstokens}'), color='white', markersize=6)

## Ergebnisverteilung

In [None]:
# Count the sampled films per (time-period colour, k-means cluster) combination.
is_blue, is_red = dfs_cleans['colors'] == 'blue', dfs_cleans['colors'] == 'red'
in_cluster0, in_cluster1 = dfs_cleans['kmeanstokens'] == 0, dfs_cleans['kmeanstokens'] == 1
b1 = dfs_cleans.loc[in_cluster1 & is_blue]
b0 = dfs_cleans.loc[in_cluster0 & is_blue]
r1 = dfs_cleans.loc[in_cluster1 & is_red]
r0 = dfs_cleans.loc[in_cluster0 & is_red]
print('b1: ', len(b1), 'b0: ', len(b0), 'r1: ', len(r1), 'r0: ', len(r0))

# Drei Cluster

## Slicing 3 Cluster Datensatz

In [None]:
# Three periods: up to 1960 (red), 1980-1985 (blue) and 2017 (green);
# every other year is 'white' and gets filtered out below.
year = dfs['year']
colors = np.select(
    [year <= 1960, (year <= 1985) & (year >= 1980), year == 2017],
    ['red', 'blue', 'green'],
    default='white',
)
dfs['colors'] = colors

# .copy() makes dfs_clean3 an independent frame, so columns added to it later
# do not trigger pandas' SettingWithCopyWarning.
dfs_clean3 = dfs[dfs.colors != 'white'].copy()
color_counts3 = dfs_clean3['colors'].value_counts()
color_counts3.plot.bar()
color_counts3

## K-Means

In [None]:
tfidf = TfidfVectorizer(max_features=10000)
tokens3 = tfidf.fit_transform(dfs_clean3.text)

In [None]:
# Three clusters for the three periods; random_state makes the labels
# reproducible.
models3 = KMeans(n_clusters=3, random_state=42)
models3.fit(tokens3)

In [None]:
# .toarray() returns an ndarray; scikit-learn rejects the np.matrix that
# .todense() returns.
dense_tokens3 = tokens3.toarray()
pca = PCA(n_components=2)
X_tf = pca.fit(dense_tokens3).transform(dense_tokens3)

# Store the intermediate results; assign() returns a new frame instead of
# writing into what may be a slice of dfs.
dfs_clean3 = dfs_clean3.assign(
    pcatokens3_1=X_tf[:, 0],
    pcatokens3_2=X_tf[:, 1],
    kmeanstokens3=models3.labels_,
)

# Plot from the stored results: dot colour = time period, digit = k-means label
plt.figure(figsize=(15,8))
for _, row in dfs_clean3.iterrows():
    x, y = row.pcatokens3_1, row.pcatokens3_2
    plt.plot(x, y, 'o', markersize=4, color=row.colors)
    plt.plot(x, y, marker=TextPath((-3, -3), f'{row.kmeanstokens3}'), color='black', markersize=1)

In [None]:
# Work on a 10 % random sample of dfs_clean3 (seeded, so the same rows are
# drawn on every run). TF-IDF and PCA are refitted on the sample; the
# 'kmeanstokens3' column read below must already be present in dfs_clean3.
dfs_cleansample3 = dfs_clean3.sample(frac=0.1, random_state=42)

tfidf = TfidfVectorizer(max_features=10000)
tokens3c = tfidf.fit_transform(dfs_cleansample3.text)

# .toarray() returns an ndarray; scikit-learn rejects the np.matrix that
# .todense() returns.
dense_tokens3c = tokens3c.toarray()
pca = PCA(n_components=2)
X_tf = pca.fit(dense_tokens3c).transform(dense_tokens3c)

# Store the intermediate results in the sampled frame
dfs_cleansample3 = dfs_cleansample3.assign(pcatokens3_1=X_tf[:, 0], pcatokens3_2=X_tf[:, 1])

# Plot from the stored results
plt.figure(figsize=(15,8))
for _, row in dfs_cleansample3.iterrows():
    x, y = row.pcatokens3_1, row.pcatokens3_2
    plt.plot(x, y, 'o', markersize=8, color=row.colors)
    plt.plot(x, y, marker=TextPath((-3, -3), f'{row.kmeanstokens3}'), color='white', markersize=6)

In [None]:
def _cell(cluster, color):
    """Rows of dfs_cleansample3 with the given k-means label and period colour."""
    frame = dfs_cleansample3
    return frame.loc[(frame['kmeanstokens3'] == cluster) & (frame['colors'] == color)]


# One frame per (colour, cluster) combination
b0, b1, b2 = (_cell(k, 'blue') for k in (0, 1, 2))
r0, r1, r2 = (_cell(k, 'red') for k in (0, 1, 2))
g0, g1, g2 = (_cell(k, 'green') for k in (0, 1, 2))
print('b1: ', len(b1), 'b0: ', len(b0), 'r1: ', len(r1), 'r0: ', len(r0))
print('    0 - 1 - 2 \ng: ', len(g0), len(g1), len(g2), '\nb: ', len(b0), len(b1), len(b2), '\nr: ', len(r0), len(r1), len(r2))

## Hierarchisches Clustering

In [None]:
# Periods 1940-1970 and 2016: draw a small random sample of dfs_clean for the
# hierarchical clustering (seeded, so the same rows are drawn on every run).
dfs_cleans = dfs_clean.sample(frac=0.008, random_state=42)
sample_color_counts = dfs_cleans['colors'].value_counts()
sample_color_counts.plot.bar()
sample_color_counts

In [None]:
tfidf = TfidfVectorizer(max_features=10000)
dfs_cleans_tfids = tfidf.fit_transform(dfs_cleans.text)

# n_clusters=None with distance_threshold=0 is used so that the fitted model
# exposes distances_, which the dendrogram cell reads.
# .toarray() returns an ndarray; scikit-learn rejects the np.matrix that
# .todense() returns.
hier_model = AgglomerativeClustering(n_clusters=None, distance_threshold=0).fit(dfs_cleans_tfids.toarray())

In [None]:
# Dendrogram of the sampled films, leaves labelled with the release year.
n_leaves = dfs_cleans_tfids.shape[0]
lmatrix = linkage_matrix(n_leaves, hier_model.children_, hier_model.distances_)
plt.figure(figsize=(15,8))
dendrogram(lmatrix, labels=dfs_cleans['year'].values, leaf_font_size=10);

Hellblau: 35, davon 23 mit Label 2016 und 12 mit Label zwischen 1940-1970 ---
Rot: 25, davon 5 mit Label 2016 und 20 mit Label zwischen 1940-1970 ---
Grün: 4, 2 mal 2016 und 1 mit Label zwischen 1940-1970

# Cluster mit zwei Variablen Zeit und Genre

In [None]:
# Keep only rows in which all of the listed fields are present.
dfs = dfs.dropna(subset=['text', 'year', 'genre', 'duration', 'runtimeMinutes'])
plt.figure(figsize=(15,8))
# Series.value_counts replaces the deprecated top-level pd.value_counts
dfs['year'].value_counts().plot.bar()
dfs.shape

In [None]:
# Films per genre combination
plt.figure(figsize=(15,8))
dfs['genre'].value_counts().plot.bar()

In [None]:
# Label by genre and year: Drama 2017 -> red, Comedy 2017 -> blue.
# The 2007 groups (orange/green in the original comments) are mapped to
# 'white', like every other film, and are therefore filtered out below.
genre, year = dfs['genre'], dfs['year']
labelcolor2var = np.select(
    [(genre == 'Drama') & (year == 2017), (genre == 'Comedy') & (year == 2017)],
    ['red', 'blue'],
    default='white',
)
dfs['labelcolor2var'] = labelcolor2var

# .copy() makes dfs_2var an independent frame, so columns added to it later
# do not trigger pandas' SettingWithCopyWarning.
dfs_2var = dfs[dfs.labelcolor2var != 'white'].copy()
label_counts = dfs_2var['labelcolor2var'].value_counts()
label_counts.plot.bar()
label_counts

## KMeans 2 Variablen Zeit und Genre

In [None]:
#tf-idf
tfidf = TfidfVectorizer(max_features=10000)
tokens_2var = tfidf.fit_transform(dfs_2var.text)

In [None]:
# KMeans; random_state makes the labels reproducible.
# NOTE(review): four clusters are requested although the labelling cell only
# keeps two groups (red/blue) - confirm that n_clusters=4 is still intended.
model_2var = KMeans(n_clusters=4, random_state=42)
model_2var.fit(tokens_2var)

In [None]:
# .toarray() returns an ndarray; scikit-learn rejects the np.matrix that
# .todense() returns.
dense_tokens_2var = tokens_2var.toarray()
pca = PCA(n_components=2)
X_tf = pca.fit(dense_tokens_2var).transform(dense_tokens_2var)

# Store the intermediate results; assign() returns a new frame instead of
# writing into what may be a slice of dfs.
dfs_2var = dfs_2var.assign(
    pca2var1=X_tf[:, 0],
    pca2var2=X_tf[:, 1],
    kmeans2var=model_2var.labels_,
)

# Plot from the stored results: dot colour = genre/year group, digit = k-means label
plt.figure(figsize=(15,8))
for _, row in dfs_2var.iterrows():
    x, y = row.pca2var1, row.pca2var2
    plt.plot(x, y, 'o', markersize=4, color=row.labelcolor2var)
    plt.plot(x, y, marker=TextPath((-3, -3), f'{row.kmeans2var}'), color='black', markersize=1)

## hierarchisches Clustering mit 2 Variablen Zeit und Genre

In [None]:
#tf-idf
tfidf = TfidfVectorizer(max_features=10000)
tokens_2var = tfidf.fit_transform(dfs_2var.text)

In [None]:
# .toarray() returns an ndarray; scikit-learn rejects the np.matrix that
# .todense() returns.
hier_model = AgglomerativeClustering(n_clusters=None, distance_threshold=0).fit(tokens_2var.toarray())

In [None]:
# Dendrogram with leaves labelled "<original title>    <group colour>".
n_leaves = tokens_2var.shape[0]
leaf_labels = dfs_2var['originalTitle'].values + '    ' + dfs_2var['labelcolor2var'].values
lmatrix = linkage_matrix(n_leaves, hier_model.children_, hier_model.distances_)
plt.figure(figsize=(15,8))
dendrogram(lmatrix, labels=(leaf_labels), leaf_font_size=10);

In [None]:
# Blue ist Comedy, 2017 und rot ist Drama, 2017 ; orange ist Drama 2007 ; green ist Comedy 2007

## ohne Serien

In [None]:
# Keep complete rows only, then restrict to titleType 'movie'.
dfs = dfs.dropna(subset=['text', 'year', 'genre', 'duration', 'runtimeMinutes'])
# .copy() makes dfs_movie an independent frame, so the columns added to it in
# later cells do not write into a slice of dfs.
dfs_movie = dfs.loc[dfs['titleType'] == 'movie'].copy()

# The 50 most frequent genre combinations (Counter is imported in the
# notebook's import cell). No figure is opened here because nothing is plotted.
genre_combination_frequs = Counter(dfs_movie.genre)
genre_combination_frequs.most_common(50)


In [None]:
# Movies per release year
plt.figure(figsize=(15,8))
dfs_movie['year'].value_counts().plot.bar()

In [8]:
# Two genre groups: Documentary -> red, Action,Crime,Drama -> blue.
# Every other film is 'white' and is left out of dfs_2var below.
genre = dfs_movie['genre']
labelcolor2var = np.select(
    [genre == 'Documentary', genre == 'Action,Crime,Drama'],
    ['red', 'blue'],
    default='white',
)
# assign() returns a new frame instead of writing into a slice of dfs
dfs_movie = dfs_movie.assign(labelcolor2var=labelcolor2var)

# Two titles are excluded by hand
excluded_titles = ['Geisha vs ninja', 'Dao jiàn xiào']
dfs_movie = dfs_movie.loc[~dfs_movie['originalTitle'].isin(excluded_titles)]

# .copy() makes dfs_2var an independent frame for the columns added later
dfs_2var = dfs_movie[dfs_movie.labelcolor2var != 'white'].copy()
label_counts = dfs_2var['labelcolor2var'].value_counts()
label_counts.plot.bar()
label_counts

NameError: name 'dfs_movie' is not defined

In [9]:
# TF-IDF over the two genre groups, then agglomerative clustering for the
# dendrogram cell.
tfidf = TfidfVectorizer(max_features=1000) #, stop_words=get_stop_words('de'))
tokens_2var = tfidf.fit_transform(dfs_2var.text)
# .toarray() returns an ndarray; scikit-learn rejects the np.matrix that
# .todense() returns.
hier_model = AgglomerativeClustering(n_clusters=None, distance_threshold=0).fit(tokens_2var.toarray())

NameError: name 'dfs_2var' is not defined

In [None]:
# Dendrogram with leaves labelled "<original title>     <genre>".
n_leaves = tokens_2var.shape[0]
leaf_labels = dfs_2var['originalTitle'].values + "     " + dfs_2var['genre'].values
lmatrix = linkage_matrix(n_leaves, hier_model.children_, hier_model.distances_)
plt.figure(figsize=(15,8))
dendrogram(lmatrix, labels=(leaf_labels), leaf_font_size=9);

## Documentary word counts

In [None]:
# Text of all documentaries in one string. Joining with a space keeps the
# last word of one film from fusing with the first word of the next one.
documentary = " ".join(dfs_movie.loc[dfs_movie['genre'] == "Documentary", 'text'])

# Raw string: "\w" in a normal string literal is an invalid escape sequence.
docu_tokens = re.findall(r"\w+", documentary.lower())
# Preview only; printing every token floods the cell output.
print(docu_tokens[:20])
count_docu = Counter(docu_tokens).most_common(20)
count_docu

## Word count by moviename

In [None]:
# Word counts for a single film, selected by its original title
# (another title noted in this cell: 'Quebrando o Tabu').
dfs_bytitle = dfs_movie.loc[dfs_movie['originalTitle'] == 'Contraband']
text = " ".join(dfs_bytitle['text'])
# Raw string: "\w" in a normal string literal is an invalid escape sequence.
title_tokens = re.findall(r"\w+", text.lower())
title_docu = Counter(title_tokens).most_common(20)
# Show the 20 most frequent tokens (the cell used to end by dumping the full
# raw text, so the counts were never displayed).
title_docu

## Action, Crime, Drama word counts

In [None]:
# Text of all 'Action,Crime,Drama' films in one string. Joining with a space
# keeps the last word of one film from fusing with the first word of the next.
acd = " ".join(dfs_movie.loc[dfs_movie['genre'] == "Action,Crime,Drama", 'text'])

# Raw string: "\w" in a normal string literal is an invalid escape sequence.
acd_tokens = re.findall(r"\w+", acd.lower())
count_acd = Counter(acd_tokens).most_common(20)
count_acd

In [None]:
# Ten most frequent word tokens per film. The comprehension keeps its
# temporaries local, so the notebook-level `tokens` variable (the TF-IDF
# matrix) is no longer overwritten by this cell.
wordfreq = [
    Counter(re.findall(r"\w+", film_text.lower())).most_common(10)
    for film_text in dfs_movie['text']
]
# assign() returns a new frame instead of writing into a slice
dfs_movie = dfs_movie.assign(wordfreq=wordfreq)
dfs_movie.head()

## GMM

In [None]:
# TF-IDF features for the two genre groups
tfidf = TfidfVectorizer(max_features=1000)
tokens_2var = tfidf.fit_transform(dfs_2var.text)
# Two-component Gaussian mixture with diagonal covariances; random_state
# makes the fitted components and labels reproducible.
gmm = mixture.GaussianMixture(n_components=2, covariance_type='diag', random_state=42)
gmm_label = gmm.fit_predict(tokens_2var.toarray())
gmm.converged_

In [None]:
gmm_label

In [None]:
# .toarray() returns an ndarray; scikit-learn rejects the np.matrix that
# .todense() returns.
dense_tokens_2var = tokens_2var.toarray()
pca = PCA(n_components=2)
X_tf = pca.fit(dense_tokens_2var).transform(dense_tokens_2var)

# Store the intermediate results; assign() returns a new frame instead of
# writing into what may be a slice of dfs_movie.
dfs_2var = dfs_2var.assign(gmm1=X_tf[:, 0], gmm2=X_tf[:, 1], gmmlabel=gmm_label)
# Legend text matches the labelling cell: Documentary = red,
# Action,Crime,Drama = blue.
print('Documentary in rot VS Action, Crime, Drama in blau')
# Plot from the stored results: dot colour = genre group, digit = GMM label
plt.figure(figsize=(15,8))
for _, row in dfs_2var.iterrows():
    x, y = row.gmm1, row.gmm2
    plt.plot(x, y, 'o', markersize=14, color=row.labelcolor2var)
    plt.plot(x, y, marker=TextPath((-3, -3), f'{row.gmmlabel}'), color='white', markersize=12)