In [None]:
Data loading

In [None]:
import csv

def load_data(file_path):
    """Read a CSV file into a list of row dictionaries.

    Args:
        file_path: Path of the CSV file to read. The first line is used as
            the header, so each row becomes a dict keyed by column name.

    Returns:
        A list with one dictionary per data row. If the file does not exist,
        an error message is printed and an empty list is returned, so the
        result can always be sliced or iterated as a list of rows.
    """
    try:
        # newline='' leaves line-ending handling to the csv module, which is
        # what it expects (keeps newlines embedded in quoted fields intact).
        with open(file_path, mode='r', encoding='utf-8', newline='') as file:
            # Using DictReader makes each row a dictionary keyed by the header
            return list(csv.DictReader(file))
    except FileNotFoundError:
        print("Error: File not found. Check the path!")
        return []

# Usage
my_data = load_data('your_file.csv')
# Only slice an actual list of rows; anything else (e.g. an error message)
# is shown in full instead of being cut down to its first two characters.
if isinstance(my_data, list):
    print(my_data[:2])  # Prints the first two rows
else:
    print(my_data)

Er


Data preprocessing

In [None]:
def preprocess_data(raw_data):
    """Clean raw row dictionaries into normalized records.

    Args:
        raw_data: Iterable of dict rows that may carry the keys 'name',
            'age' and 'status'.

    Returns:
        A list of dicts with the keys:
          - 'name': the name stripped of surrounding whitespace and lowercased
            ('' when the field is missing or None),
          - 'age': the age as an int (0 when missing or not convertible),
          - 'is_active': True only when the status, stripped and lowercased,
            equals 'active'.
    """
    cleaned_data = []

    for row in raw_data:
        # Rows may lack a field or hold None for it (csv.DictReader does this
        # for short lines), so fall back to empty text before calling .strip().
        name = row.get('name') or ''
        status = row.get('status') or ''

        # Type casting with error handling: bad or missing ages become 0.
        try:
            age = int(row.get('age'))
        except (ValueError, TypeError):
            age = 0  # Default value for missing/bad data

        cleaned_data.append({
            # 1. Cleaning strings (lowercase and stripping whitespace)
            'name': name.strip().lower(),
            # 2. Age converted to int above
            'age': age,
            # 3. Normalization: the status text 'active' becomes a Boolean
            'is_active': status.strip().lower() == 'active',
        })

    return cleaned_data

# Example Usage:
# raw_input = [{'name': ' Alice ', 'age': '30', 'status': 'Active'}]
# clean_output = preprocess_data(raw_input)

Exploratory data analysis

In [None]:
def perform_eda(data_list):
    """Compute basic descriptive statistics for a collection of numbers.

    None entries are ignored. The remaining values are sorted to obtain the
    minimum, maximum and median.

    Args:
        data_list: Iterable of numbers, possibly containing None.

    Returns:
        A dict with the keys "Count", "Mean" (rounded to 2 decimals),
        "Median", "Min", "Max" and "Range". When no usable values remain,
        "Count" is 0 and every other statistic is None.
    """
    # Sort data for median and range
    sorted_data = sorted(x for x in data_list if x is not None)
    n = len(sorted_data)

    # Nothing to summarize: avoid dividing by zero / indexing an empty list.
    if n == 0:
        return {
            "Count": 0,
            "Mean": None,
            "Median": None,
            "Min": None,
            "Max": None,
            "Range": None
        }

    # Basic stats
    mean = sum(sorted_data) / n
    minimum = sorted_data[0]
    maximum = sorted_data[-1]

    # Median: average of the two middle values for an even count,
    # otherwise the single middle value.
    middle = n // 2
    if n % 2 == 0:
        median = (sorted_data[middle - 1] + sorted_data[middle]) / 2
    else:
        median = sorted_data[middle]

    return {
        "Count": n,
        "Mean": round(mean, 2),
        "Median": median,
        "Min": minimum,
        "Max": maximum,
        "Range": maximum - minimum
    }

# Example use: summarize a small sample of ages
ages = [
    22, 25, 22, 30, 45, 90, 24,
]
print(perform_eda(data_list=ages))

{'Count': 7, 'Mean': 36.86, 'Median': 25, 'Min': 22, 'Max': 90, 'Range': 68}


User-movie matrix creation

In [None]:
# Sample ratings, written as (user, movie, rating) and expanded into records
raw_data = [
    {"user": user, "movie": movie, "rating": rating}
    for user, movie, rating in (
        ("Alice", "Matrix", 5),
        ("Alice", "Inception", 4),
        ("Bob", "Matrix", 3),
        ("Charlie", "Inception", 5),
        ("Bob", "Interstellar", 4),
    )
]

def create_user_movie_matrix(data):
    """Build a nested {user: {movie: rating}} dictionary from rating records.

    Each record is read through its 'user', 'movie' and 'rating' keys. When
    the same user/movie pair appears more than once, the last rating wins.
    """
    ratings_by_user = {}

    for record in data:
        user, movie, rating = record['user'], record['movie'], record['rating']
        # setdefault creates the user's inner dict on first sight
        ratings_by_user.setdefault(user, {})[movie] = rating

    return ratings_by_user

user_movie_matrix = create_user_movie_matrix(data=raw_data)

# Look up Alice's rating for 'Matrix'; .get falls back to 0 if it is absent
print(user_movie_matrix['Alice'].get('Matrix', 0)) # Output: 5

5


Similarity calculation

In [None]:
import math

def euclidean_similarity(matrix, user1, user2):
    """Score how alike two users' ratings are, based on Euclidean distance.

    Only movies present under both users in ``matrix`` are compared. Returns
    0 when they share no movies; otherwise 1 / (1 + distance), so identical
    ratings on the shared movies give 1.
    """
    # Collect the movies that appear for both users
    shared = []
    for title in matrix[user1]:
        if title in matrix[user2]:
            shared.append(title)

    # No common ground
    if not shared:
        return 0

    # Accumulate the squared rating differences over the shared movies
    squared_gap = 0
    for title in shared:
        squared_gap += (matrix[user1][title] - matrix[user2][title]) ** 2

    # Map the distance onto a similarity score where 1 means identical
    return 1 / (1 + math.sqrt(squared_gap))

# Example:
# alice_bob_sim = euclidean_similarity(user_movie_matrix, 'Alice', 'Bob')

Movie recommendation logic

In [None]:
def get_recommendations(matrix, target_user):
    """Rank movies for ``target_user`` using similarity-weighted ratings.

    Every other user whose euclidean_similarity to the target is positive
    contributes ``rating * similarity`` for each movie the target has not
    rated (or has rated 0). A movie's score is its weighted total divided by
    the sum of the similarities that contributed to it.

    Returns:
        A list of (score, movie) tuples sorted in descending order.
    """
    weighted_totals = {}
    similarity_totals = {}

    for candidate in matrix:
        # The target user is never compared with themselves
        if candidate == target_user:
            continue

        weight = euclidean_similarity(matrix, target_user, candidate)

        # Users without any commonality contribute nothing
        if weight <= 0:
            continue

        for title, rating in matrix[candidate].items():
            seen = matrix[target_user]
            # Skip movies the target user has already given a non-zero rating
            if title in seen and seen[title] != 0:
                continue

            # Similarity * rating is this user's weighted vote for the movie
            weighted_totals[title] = weighted_totals.get(title, 0) + rating * weight
            # Track the total similarity behind each movie for normalization
            similarity_totals[title] = similarity_totals.get(title, 0) + weight

    # Normalize each total and order the results from highest score down
    return sorted(
        (
            (total / similarity_totals[title], title)
            for title, total in weighted_totals.items()
        ),
        reverse=True,
    )

# Usage:
# print(get_recommendations(user_movie_matrix, 'Alice'))

Content-based filtering

In [None]:
# 1. Item Data (Movie Profiles): each title maps to its set of genre tags
movies = {
    title: set(genres)
    for title, genres in (
        ("Matrix", ("Action", "Sci-Fi")),
        ("Inception", ("Sci-Fi", "Thriller", "Action")),
        ("Toy Story", ("Animation", "Children")),
        ("Interstellar", ("Sci-Fi", "Drama")),
        ("The Dark Knight", ("Action", "Crime", "Drama")),
    )
}

# 2. User History (What Alice liked)
alice_likes = ["Matrix", "Inception"]

def get_content_recommendations(target_likes, movie_db):
    """Recommend movies whose tags resemble those of the liked movies.

    The user's profile is the union of the tags of every movie in
    ``target_likes``. Each movie in ``movie_db`` that is not already liked is
    scored with the Jaccard similarity between its tags and that profile.

    Args:
        target_likes: Titles the user liked; each must be a key of movie_db.
        movie_db: Mapping of movie title to a collection of tags.

    Returns:
        A list of (score, movie) tuples sorted in descending order. A movie
        scores 0.0 when both its tags and the profile are empty.
    """
    # A set makes the "already liked" check below a constant-time lookup
    liked = set(target_likes)

    # Create the "ideal profile" (union of tags from the liked movies)
    user_profile = set()
    for movie in liked:
        user_profile.update(movie_db[movie])

    recommendations = []

    for movie, tags in movie_db.items():
        if movie in liked:
            continue

        # Jaccard similarity: |intersection| / |union|
        intersection = len(user_profile.intersection(tags))
        union = len(user_profile.union(tags))
        # An empty union means there is nothing to compare; avoid dividing by zero
        score = intersection / union if union else 0.0

        recommendations.append((score, movie))

    # Sort by score
    return sorted(recommendations, reverse=True)

# Result: ranked (score, title) suggestions based on what Alice liked
print(get_content_recommendations(target_likes=alice_likes, movie_db=movies))

[(0.25, 'Interstellar'), (0.2, 'The Dark Knight'), (0.0, 'Toy Story')]


Performance analysis

In [None]:
import math

def evaluate_performance(test_data):
    """
    Measure prediction quality as MAE and RMSE.

    test_data: list of tuples (actual_rating, predicted_rating)
    Example: [(5, 4.2), (3, 3.8), (4, 4.0)]

    Returns a dict with "MAE" and "RMSE", each rounded to 4 decimals, or the
    string "No data to evaluate" when test_data is empty.
    """
    sample_count = len(test_data)
    if not sample_count:
        return "No data to evaluate"

    # Signed error of every prediction, in input order
    residuals = [actual - predicted for actual, predicted in test_data]

    mean_abs_error = sum(abs(gap) for gap in residuals) / sample_count
    mean_sq_error = sum(gap ** 2 for gap in residuals) / sample_count

    return {
        "MAE": round(mean_abs_error, 4),
        "RMSE": round(math.sqrt(mean_sq_error), 4)
    }

# Example usage:
# results = [(5, 4.5), (2, 2.8), (4, 3.9)]
# print(evaluate_performance(results))

Result display

In [None]:
def display_recommendations(user_name, recommendations):
    print(f"\n{'='*40}")
    print(f" TOP MOVIE PICKS FOR: {user_name.upper()} ")
    print(f"{'='*40}")
    print(f"{'Rank':<6} | {'Movie Title':<20} | {'Score'}")
    print(f"{'-'*40}")

    for i, (score, movie) in enumerate(recommendations, 1):
        # Format: Rank (6 chars), Title (20 chars), Score (2 decimals)
        print(f"{i:<6} | {movie:<20} | {score:.2f}")

    print(f"{'='*40}\n")

# Example Usage:
# results = [(4.921, "Inception"), (4.75, "The Godfather")]
# display_recommendations("Alice", results)