## Notebook (Download, Data Cleaning, Data Visualization, Modeling)

In [None]:
# Import libraries
from bs4 import BeautifulSoup
import pandas as pd
import csv
import matplotlib.pyplot as plt
import seaborn

# Get and scrape the data
import requests

base_url = "https://fangj.github.io/friends/"

# Fetch the index page that links to every episode transcript.
# The timeout keeps the notebook from hanging on a stalled connection, and
# raise_for_status() stops right here instead of parsing an HTTP error page
# (which would otherwise just produce an empty scrape further down).
response = requests.get(base_url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')

# Collect the link targets. href=True skips anchors that have no href, so
# every entry is a string; a None entry would break the `startswith` filter
# applied to these links later on.
links = [anchor['href'] for anchor in soup.find_all('a', href=True)]

# Define a function to extract the dialogs from an episode
def extract_dialogs(episode_url, timeout=30):
    """Download one episode page and return its "speaker: text" lines.

    Args:
        episode_url: URL of the episode's HTML page.
        timeout: Seconds to wait for the server before ``requests`` gives up
            with a timeout error. Without it a single stalled connection
            would block the whole scrape indefinitely.

    Returns:
        A list of ``(character, dialogue)`` tuples, one for every text line
        of the page that contains a colon. The line is split on its first
        colon only, so colons inside the dialogue are kept, and both parts
        are stripped of surrounding whitespace. No filtering by speaker
        happens here, so non-dialogue lines containing a colon are returned
        as well.
    """
    # NOTE: the HTTP status is not checked; an error page is parsed like any other.
    response = requests.get(episode_url, timeout=timeout)
    soup = BeautifulSoup(response.text, 'html.parser')

    dialogs = []
    for line in soup.get_text().split('\n'):
        # partition() splits on the first colon; an empty separator means
        # the line contained no colon at all.
        character, separator, dialogue = line.partition(':')
        if separator:
            dialogs.append((character.strip(), dialogue.strip()))
    return dialogs

# Scrape the data and save it season-wise
# Maps a numeric episode code (the digits taken from the page name) to the
# list of (character, dialogue) tuples scraped for that code.
season_data = {}
for link in links:
    # Anchors without an href come through as None; skip those together with
    # every link that does not point into the "season..." pages.
    if not link or not link.startswith('season'):
        continue

    # Last path segment, without the file extension and without anything
    # from the first "ep" onwards.
    season_str = link.split('/')[-1].split('.')[0].split('ep')[0]

    # Handle edge cases
    if '-' in season_str:
        # Hyphenated names are filed under the part before the hyphen
        # (presumably double episodes such as "0212-0213" -- TODO confirm).
        season_str = season_str.split('-')[0]
    if 'outtakes' in season_str:
        continue

    try:
        episode_code = int(season_str)
    except ValueError:
        # An unexpected, non-numeric page name should not abort the scrape.
        print(f"Skipping link with unrecognized name: {link}")
        continue

    episode_url = base_url + link
    season_data.setdefault(episode_code, []).extend(extract_dialogs(episode_url))

# Process and clean the data
# Remove non-dialogue lines and non-main characters
main_characters = ["Monica", "Chandler", "Ross", "Rachel", "Phoebe", "Joey"]

# Keep only the lines spoken by one of the six main characters. Speaker names
# are title-cased before the comparison (and stored that way), so "ROSS" and
# "ross" both end up as "Ross"; everything else is dropped.
cleaned_data = {}
for season, dialogs in season_data.items():
    kept_lines = []
    for character, dialogue in dialogs:
        speaker = character.title()
        if speaker in main_characters:
            kept_lines.append((speaker, dialogue))
    cleaned_data[season] = kept_lines
        
# Create folders
import os

# Create data/Season_01 ... data/Season_10.
# os.makedirs also creates the parent "data" folder when it is missing and,
# with exist_ok=True, does not complain when the notebook is re-run. Unlike
# the `!mkdir` shell magic it works outside IPython and on any OS.
for num in range(1, 11):
    os.makedirs(f"data/Season_{num:02d}", exist_ok=True)

# Save the data in csv per season
for episode_code, dialogs in cleaned_data.items():
    # The key packs season and episode into one integer: the last two digits
    # are the episode, the digits before them the season (101 -> season 1,
    # episode 1; 1018 -> season 10, episode 18). divmod splits it without
    # string slicing, so one code path covers 3- and 4-digit codes alike.
    season_number, episode_number = divmod(episode_code, 100)
    csv_path = f"data/Season_{season_number:02d}/Episode_{episode_number:02d}.csv"

    # newline='' is what the csv module expects so it can control line endings.
    with open(csv_path, "w", newline='', encoding='utf-8') as csv_file:
        csv_writer = csv.writer(csv_file)
        csv_writer.writerow(['Character', 'Dialogue'])
        csv_writer.writerows(dialogs)

print('Loaded successfully!')

In [None]:
# Concatenate all csv files and clean the data
import os
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import re

data_dir = 'data'

all_data = []

# Only the "Season_XX" folders are visited: data/ can also contain other
# entries (the exported friends_data.csv after a first run, checkpoint
# folders, ...) and calling os.listdir() on those would crash or load junk.
# Sorting the listings makes the load order deterministic.
for season in sorted(os.listdir(data_dir)):
    season_path = os.path.join(data_dir, season)
    if not (season.startswith('Season_') and os.path.isdir(season_path)):
        continue
    season_number = int(season.split('_')[-1])

    for episode_file in sorted(os.listdir(season_path)):
        if not (episode_file.startswith('Episode_') and episode_file.endswith('.csv')):
            continue
        episode_path = os.path.join(season_path, episode_file)
        # Each file starts with a "Character,Dialogue" header row, so pandas
        # reads it as the header instead of loading it as a data row that has
        # to be filtered out afterwards.
        # dtype=str + keep_default_na=False keep every cell a string: an empty
        # dialogue stays "" rather than becoming NaN, and a line such as "NA"
        # or "null" is not mistaken for a missing value.
        episode_data = pd.read_csv(episode_path, dtype=str, keep_default_na=False)
        episode_data['Season'] = season_number
        episode_data['Episode'] = int(episode_file.split('_')[-1].split('.')[0])
        all_data.append(episode_data)

data = pd.concat(all_data, ignore_index=True)

# Sort the data
data = data.sort_values(['Season', 'Episode']).reset_index(drop=True)

In [None]:
# Clean the data
# Ordered (pattern, replacement) pairs applied before punctuation is removed.
# Order matters: the specific contractions ("can't", "won't") must run before
# the generic "n't" rule.
_CONTRACTION_RULES = (
    (r"what's", "what is"),
    (r"'s", " is"),
    (r"'ve", " have"),
    (r"can't", "cannot"),
    (r"won't", "will not"),
    (r"n't", " not"),
    (r"'re", " are"),
    (r"'d", " would"),
    (r"'ll", " will"),
)


def clean_text(text):
    """Normalize one dialogue line for text analysis.

    The text is lower-cased, common English contractions are expanded
    ("isn't" -> "is not", "they're" -> "they are"), every character that is
    neither a word character nor whitespace is removed, and "cmon" is
    rewritten to "come on".

    Args:
        text: The raw dialogue. ``None`` and NaN (how pandas represents an
            empty CSV cell) are treated as an empty line; any other
            non-string value is converted with ``str()``.

    Returns:
        The cleaned string. Runs of whitespace are left as they are.

    Note:
        Every "'s" is expanded to " is", so possessives are expanded too
        ("ross's" -> "ross is").
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    # Map the typographic apostrophe to the ASCII one so the rules below match.
    text = str(text).lower().replace("\u2019", "'")

    for pattern, replacement in _CONTRACTION_RULES:
        text = re.sub(pattern, replacement, text)

    # Drop punctuation; this also turns "c'mon" into "cmon" for the next rule.
    text = re.sub(r"[^\w\s]", "", text)
    text = re.sub(r"cmon", "come on", text)
    # Kept for input that already contains the fused form.
    text = re.sub(r"donot", "do not", text)
    return text

# A dialogue that was empty in the CSV is read back by pandas as NaN (a
# float); replace those with "" so clean_text always receives a string.
data["Dialogue"] = data["Dialogue"].fillna("").apply(clean_text)

In [None]:
# Export the combined, cleaned table as one CSV (the row index is not written).
data.to_csv(path_or_buf='data/friends_data.csv', index=False)

In [None]:
## Data Visualization
# Character lines
# Number of dialogue lines per character, most frequent first
character_lines = data['Character'].value_counts(sort=True, ascending=False)
# Bar colours (matplotlib colour names) shared by the charts below
colors = "r purple b c m y".split()

In [None]:
# Visualize number of lines per character (Bar Chart)
# Line counts per character, most frequent first; chart at most the top ten.
character_lines = data['Character'].value_counts()
top_characters = character_lines.head(10)
plt.figure(figsize=(12, 6))
plt.bar(top_characters.index, top_characters, color=colors)
plt.xlabel('characters')
plt.ylabel('Number of lines')
plt.title('Number of lines per character')
plt.show()

In [None]:
# Number of lines per season
import matplotlib.pyplot as plt

# Count dialogue lines per season; sort_index() puts the bars in season order.
lines_per_season = data['Season'].value_counts().sort_index()
plt.figure(figsize=(14, 8))
plt.bar(lines_per_season.index, lines_per_season.values, color=colors)
plt.xlabel('Season')
plt.ylabel('Number of lines')
plt.title('Number of lines per season')
plt.show()

In [None]:
# Top characters per season
seasons = sorted(data['Season'].unique())
character_lines = data['Character'].value_counts()
top_characters = character_lines.head(10).index

# One colour per season. Ten entries, because pairing seasons with a shorter
# list would silently leave the last season out of the chart.
colors = ["r", "purple", "b", "c", "m", "y", "g", "orange", "brown", "gray"]

plt.figure(figsize=(12, 6))
# Running left edge of each character's bar, so the seasons stack end to end.
lefts = [0] * len(top_characters)

for position, season in enumerate(seasons):
    # Named season_rows so the scraped `season_data` dict is not overwritten.
    season_rows = data[data['Season'] == season]
    # reindex() puts the counts in the same character order every time and
    # fills in 0 for a character with no lines in this season (.loc would
    # raise a KeyError instead).
    season_counts = season_rows['Character'].value_counts().reindex(top_characters, fill_value=0)

    plt.barh(
        season_counts.index,
        season_counts.values,
        left=lefts,
        # Cycle through the palette in case there are more seasons than colours.
        color=colors[position % len(colors)],
    )
    lefts = [sum(pair) for pair in zip(lefts, season_counts.values)]

# Horizontal bars: the counts run along x, the characters along y.
plt.xlabel('Number of lines')
plt.ylabel('Character')
plt.title('Number of lines per character in season')
plt.legend([str(season) for season in seasons], title='Season')
plt.show()

In [None]:
# Distribution of line lengths
# Word count of every dialogue line (whitespace-separated tokens)
word_counts = data['Dialogue'].map(lambda line: len(line.split()))
data['Line_Length'] = word_counts

plt.figure(figsize=(6, 3))
plt.hist(word_counts, bins=5)
plt.title('Distribution of line lengths')
plt.xlabel('Line length (words)')
plt.ylabel('Frequency')
plt.show()

In [None]:
# Average line length per character
character_lines = data['Character'].value_counts()
top_characters = character_lines.head(10).index

# Words per line, then the mean of that for every character (longest first)
data['Line_Length'] = data['Dialogue'].map(lambda line: len(line.split()))
mean_length_by_character = data.groupby('Character')['Line_Length'].mean()
character_line_length = mean_length_by_character.sort_values(ascending=False)

# Selecting by top_characters puts the bars in line-count order
top_character_line_length = character_line_length.loc[top_characters]

plt.figure(figsize=(10, 5))
plt.bar(top_character_line_length.index, top_character_line_length.values, color=colors)
plt.title('Average line length per character')
plt.xlabel('Character')
plt.ylabel('Average line length (words)')
plt.show()

In [None]:
# Sentiment analysis of dialogues
from textblob import TextBlob

def get_sentiment(text):
    """Return the sentiment polarity score that TextBlob assigns to *text*."""
    return TextBlob(text).sentiment.polarity

# Score every dialogue line, then plot how the polarity scores are distributed
data['Sentiment'] = data['Dialogue'].map(get_sentiment)

plt.figure(figsize=(10, 5))
plt.hist(data['Sentiment'], bins=10)
plt.title('Sentiment analysis of dialogues')
plt.xlabel('Sentiment polarity')
plt.ylabel('Frequency')
plt.show()

# Neutral 

In [None]:
# Modeling

In [None]:
# Text Classification

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Top characters with most lines
character_lines = data['Character'].value_counts()
top_characters = character_lines.head(10).index
is_top_character = data['Character'].isin(top_characters)
filtered_data = data[is_top_character]

# Feature: the dialogue text. Label: the character who said it.
X, y = filtered_data['Dialogue'], filtered_data['Character']

# Hold out 20% of the lines for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# The TF-IDF vocabulary is fitted on the training split only and then reused
# unchanged for the test split.
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

classifier = MultinomialNB()
classifier.fit(X_train_tfidf, y_train)

y_pred = classifier.predict(X_test_tfidf)

# Print each metric under its heading
evaluation = (
    ("Classification Report:", classification_report(y_test, y_pred)),
    ("Confusion Matrix:", confusion_matrix(y_test, y_pred)),
    ("Accuracy Score:", accuracy_score(y_test, y_pred)),
)
for heading, result in evaluation:
    print(heading)
    print(result)

In [None]:
# Save the model
import pickle
import numpy as np

import os

filename = "models/text_classification_model.pkl"
vectorizer_filename = 'models/vectorizer.pkl'

# open() does not create missing folders, so make sure "models" exists.
os.makedirs(os.path.dirname(filename), exist_ok=True)

# Save the classifier together with the vectorizer it was trained with.
# The `with` blocks close (and flush) each file as soon as it is written.
with open(filename, 'wb') as model_file:
    pickle.dump(classifier, model_file)
with open(vectorizer_filename, 'wb') as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)

# Load the saved model
# NOTE: unpickling can execute arbitrary code; only load pickle files that
# were produced by this notebook or come from a source you trust.
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)
with open(vectorizer_filename, 'rb') as vectorizer_file:
    loaded_vectorizer = pickle.load(vectorizer_file)

# Test the model
X_text_transformed = loaded_vectorizer.transform(X_test)
y_pred = loaded_model.predict(X_text_transformed)

# Evaluate the model's performance
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy Score: {accuracy}")

# Define a new dialogue
new_dialogue = "We were on a break!"

# Run the new line through the same cleaning as the training data so that
# contractions and punctuation are represented the same way.
new_dialogue_transformed = loaded_vectorizer.transform([clean_text(new_dialogue)])
predicted_character = loaded_model.predict(new_dialogue_transformed)

print(f"The predicted character for the given dialogue is: {predicted_character[0]}")

# Get the probabilities for each class (character)
predicted_probabilities = loaded_model.predict_proba(new_dialogue_transformed)
# predict_proba returns one row per input; take the argmax within the first
# (only) row so the index is a valid position in classes_.
max_prob_index = np.argmax(predicted_probabilities[0])
max_probability = predicted_probabilities[0][max_prob_index]
predicted_character = loaded_model.classes_[max_prob_index]

print(f"The predicted character for the given dialogue is: {predicted_character}")
print(f"The confidence probability of the prediction is: {max_probability:.2f}")