Pandas on Spark version of the original transformations (no saves).

In [5]:
import pyspark.pandas as ps

# Directory that holds the raw .dat files (movies, users, ratings) read by the cells below.
ruta_carpeta = '/home/jovyan/work/datasets/Originales'

In [10]:
# Movies
# Load the raw '::'-delimited movies file into a pandas-on-Spark DataFrame.
# The file has no header row, so column names are supplied explicitly.
#
# Changes with respect to a plain-pandas call:
# - `engine='python'` is not passed: pyspark.pandas.read_csv has no such
#   parameter and would only forward it to the Spark reader as an option
#   it does not recognise.
# - The encoding is stated explicitly. NOTE(review): the row count matches the
#   MovieLens 1M movies.dat, which is distributed as ISO-8859-1; without this,
#   accented titles are decoded incorrectly. Confirm the file was not
#   re-encoded to UTF-8 before relying on it.
df_movies = ps.read_csv(
    f'{ruta_carpeta}/movies.dat',
    sep='::',
    header=None,
    names=['filmId', 'film', 'genders'],
    encoding='ISO-8859-1',
)
print('Filas movies:', df_movies.shape[0])
df_movies.head()



Filas movies: 3883


Unnamed: 0,filmId,film,genders
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [12]:
# Movies clean-up: work on a copy so df_movies keeps the raw columns.
df_movies_mod = df_movies.copy()

# Year: collapse the whole title to the 4-digit group found inside parentheses,
# then convert to a number. Titles that do not match stay as text and are
# coerced to a missing value. (str.replace is used because str.extract is not
# supported in this version.)
year_as_text = df_movies_mod['film'].str.replace(r'.*\((\d{4})\).*', r'\1', regex=True)
df_movies_mod['year'] = ps.to_numeric(year_as_text, errors='coerce')

# Title: strip the trailing "(YYYY)" together with the whitespace before it.
title_without_year = df_movies_mod['film'].str.replace(r'\s*\(\d{4}\)$', '', regex=True)
df_movies_mod['film'] = title_without_year


def split_genres(raw):
    # Turn a 'A|B|C' string into ['A', 'B', 'C']; None becomes an empty list
    # and empty fragments are discarded. (Direct str.split is avoided here.)
    if raw is None:
        return []
    return [piece for piece in str(raw).split('|') if piece]


df_movies_mod['genders'] = df_movies_mod['genders'].apply(split_genres)

preview_columns = ['film', 'year', 'genders']
df_movies_mod[preview_columns].head()




Unnamed: 0,film,year,genders
0,Toy Story,1995.0,"[Animation, Children's, Comedy]"
1,Jumanji,1995.0,"[Adventure, Children's, Fantasy]"
2,Grumpier Old Men,1995.0,"[Comedy, Romance]"
3,Waiting to Exhale,1995.0,"[Comedy, Drama]"
4,Father of the Bride Part II,1995.0,[Comedy]


In [13]:
# Genre counts
# One row per (film, genre) via explode, then count how often each genre occurs.
# The output columns are named explicitly instead of renaming whatever
# reset_index() happens to produce: the default names of the index/value
# columns after value_counts() differ between releases ('index'/'genders' on
# older ones, 'genders'/'count' on those following pandas 2), and the old
# rename mapping would create two 'count' columns on the latter.
genre_counts = (
    df_movies_mod['genders']
    .explode()
    .dropna()
    .value_counts()
    .rename_axis('genre')        # index (the genre label) -> column 'genre'
    .reset_index(name='count')   # values (the frequency)  -> column 'count'
)
genre_counts.head(20)



Unnamed: 0,genre,count
0,Drama,1603
1,Comedy,1200
2,Action,503
3,Thriller,492
4,Romance,471
5,Horror,343
6,Adventure,283
7,Sci-Fi,276
8,Children's,251
9,Crime,211


In [14]:
# Users
# Load the raw '::'-delimited users file into a pandas-on-Spark DataFrame.
# The file has no header row, so column names are supplied explicitly.
# `engine='python'` is not passed: pyspark.pandas.read_csv has no such
# parameter and would only forward it to the Spark reader as an option it
# does not recognise.
df_users = ps.read_csv(
    f'{ruta_carpeta}/users.dat',
    sep='::',
    header=None,
    names=['userId', 'gender', 'age', 'occupation', 'zip'],
)
print('Filas users:', df_users.shape[0])
df_users.head()



Filas users: 6040


Unnamed: 0,userId,gender,age,occupation,zip
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


In [15]:
# User transformations: lookup tables.

# Numeric code found in the users' 'occupation' column -> descriptive label.
occupation_map = {
    0: 'Otro / No especificado',
    1: 'Academico / Educador',
    2: 'Artista',
    3: 'Administrativo / Oficina',
    4: 'Estudiante universitario / Postgrado',
    5: 'Atencion al cliente',
    6: 'Medico / Sector salud',
    7: 'Ejecutivo / Gerente',
    8: 'Agricultor',
    9: 'Amo/a de casa',
    10: 'Estudiante (Escuela/Instituto)',
    11: 'Abogado',
    12: 'Programador',
    13: 'Jubilado',
    14: 'Ventas / Marketing',
    15: 'Cientifico',
    16: 'Autonomo',
    17: 'Tecnico / Ingeniero',
    18: 'Artesano / Oficio manual',
    19: 'Desempleado',
    20: 'Escritor',
}

# Value found in the users' 'age' column -> (age_group_mean, age_group_letter),
# i.e. the two derived columns written by the users transformation.
age_groups = {
    age_code: (group_mean, group_letter)
    for age_code, group_mean, group_letter in (
        (1, 17, 'A'),
        (18, 21, 'B'),
        (25, 30, 'C'),
        (35, 40, 'D'),
        (45, 47, 'E'),
        (50, 52, 'F'),
        (56, 60, 'G'),
    )
}

# Inclusive (low, high, code) ranges over the first three digits of a ZIP code,
# mapped to the two-letter state/territory code. Ranges do not overlap, so the
# lookup result does not depend on the order of the entries.
zip_state_ranges = [
    (350, 369, 'AL'), (995, 999, 'AK'), (850, 865, 'AZ'), (716, 729, 'AR'),
    (900, 961, 'CA'), (800, 816, 'CO'), (60, 69, 'CT'), (197, 199, 'DE'),
    (200, 200, 'DC'), (202, 205, 'DC'), (569, 569, 'DC'), (320, 349, 'FL'),
    (300, 319, 'GA'), (967, 968, 'HI'), (832, 838, 'ID'), (600, 629, 'IL'),
    (460, 479, 'IN'), (500, 528, 'IA'), (660, 679, 'KS'), (400, 427, 'KY'),
    (700, 715, 'LA'), (39, 49, 'ME'), (206, 219, 'MD'), (10, 27, 'MA'),
    (480, 499, 'MI'), (550, 567, 'MN'), (386, 397, 'MS'), (630, 658, 'MO'),
    (590, 599, 'MT'), (680, 693, 'NE'), (889, 898, 'NV'), (30, 38, 'NH'),
    (70, 89, 'NJ'), (870, 884, 'NM'), (100, 149, 'NY'), (270, 289, 'NC'),
    (580, 588, 'ND'), (430, 459, 'OH'), (730, 749, 'OK'), (970, 979, 'OR'),
    (150, 196, 'PA'), (28, 29, 'RI'), (290, 299, 'SC'), (570, 577, 'SD'),
    (370, 385, 'TN'), (750, 799, 'TX'), (885, 885, 'TX'), (840, 847, 'UT'),
    (50, 59, 'VT'), (201, 201, 'VA'), (220, 246, 'VA'), (980, 994, 'WA'),
    (247, 268, 'WV'), (530, 549, 'WI'), (820, 831, 'WY'),
    # Prefix 008 belongs to the Virgin Islands. The Puerto Rico range is split
    # around it; a single (6, 9, 'PR') entry listed first made 'VI' unreachable.
    (6, 7, 'PR'), (8, 8, 'VI'), (9, 9, 'PR'),
    (969, 969, 'GU')
]


# NOTE: no return annotation is added on purpose. This function is passed to
# Series.apply, and pandas-on-Spark interprets a return type hint as the
# output schema instead of inferring it.
def zip_to_state(zip_code):
    """Return the state/territory code for a ZIP code string, or None.

    Only ``str`` input is accepted; any other type yields None. All ASCII
    digits are collected from the string (so '98107-2117' works), and the
    first three of them are looked up in ``zip_state_ranges``. None is
    returned when fewer than three digits are present or when the prefix
    falls in no known range.
    """
    if not isinstance(zip_code, str):
        return None
    # Keep ASCII digits only: str.isdigit() also accepts characters such as
    # superscripts, which int() cannot parse.
    digits = ''.join(ch for ch in zip_code if '0' <= ch <= '9')
    if len(digits) < 3:
        return None
    prefix = int(digits[:3])
    for low, high, state in zip_state_ranges:
        if low <= prefix <= high:
            return state
    return None

# Build the cleaned users table on a copy so df_users keeps the raw columns.
df_users_mod = df_users.copy()


def age_group_mean_for(age_code):
    # First element of the age_groups entry; None for an unknown code.
    return age_groups.get(age_code, (None, None))[0]


def age_group_letter_for(age_code):
    # Second element of the age_groups entry; None for an unknown code.
    return age_groups.get(age_code, (None, None))[1]


# Occupation code -> label, stored with the 'string' dtype.
occupation_labels = df_users_mod['occupation'].map(occupation_map)
df_users_mod['occupation'] = occupation_labels.astype('string')

# Two derived columns from the age code.
df_users_mod['age_group_mean'] = df_users_mod['age'].map(age_group_mean_for)
df_users_mod['age_group_letter'] = df_users_mod['age'].map(age_group_letter_for)

# ZIP -> state code; anything that could not be resolved is labelled 'Other'.
state_codes = df_users_mod['zip'].apply(zip_to_state)
df_users_mod['state'] = state_codes.fillna('Other')

# The source columns are no longer needed once the derived ones exist.
df_users_mod = df_users_mod.drop(columns=['age', 'zip'])
df_users_mod.head()

Unnamed: 0,userId,gender,occupation,age_group_mean,age_group_letter,state
0,1,F,Estudiante (Escuela/Instituto),17,A,MI
1,2,M,Autonomo,60,G,LA
2,3,M,Cientifico,30,C,MN
3,4,M,Ejecutivo / Gerente,47,E,MA
4,5,M,Escritor,30,C,MN


In [16]:
# Ratings
# Load the raw '::'-delimited ratings file into a pandas-on-Spark DataFrame.
# The file has no header row, so column names are supplied explicitly.
# `engine='python'` is not passed: pyspark.pandas.read_csv has no such
# parameter and would only forward it to the Spark reader as an option it
# does not recognise.
df_ratings = ps.read_csv(
    f'{ruta_carpeta}/ratings.dat',
    sep='::',
    header=None,
    names=['userId', 'filmId', 'rating', 'timestamp'],
)
print('Filas ratings:', df_ratings.shape[0])
df_ratings.head()



Filas ratings: 1000209


Unnamed: 0,userId,filmId,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [17]:
# Convert the timestamp and join ratings with movies and users.
#
# df_ratings is modified in place: 'timestamp' (seconds, per unit='s') is
# replaced by a 'date' string formatted as dd/mm/YYYY. The guard makes the
# cell safe to re-run: once 'timestamp' has been dropped, the conversion has
# already happened and is skipped instead of raising a KeyError.
if 'timestamp' in df_ratings.columns:
    df_ratings['date'] = ps.to_datetime(df_ratings['timestamp'], unit='s').dt.strftime('%d/%m/%Y')
    df_ratings = df_ratings.drop(columns=['timestamp'])

# Left joins keep every rating even when the film or user has no match.
df_ratings_full = df_ratings.merge(df_movies_mod, on='filmId', how='left')
df_ratings_full = df_ratings_full.merge(df_users_mod, on='userId', how='left')
print('Filas resultantes:', df_ratings_full.shape[0])
df_ratings_full.head()

Filas resultantes: 1000209


Unnamed: 0,userId,filmId,rating,date,film,genders,year,gender,occupation,age_group_mean,age_group_letter,state
0,1,1193,5,31/12/2000,One Flew Over the Cuckoo's Nest,[Drama],1975.0,F,Estudiante (Escuela/Instituto),17,A,MI
1,1,661,3,31/12/2000,James and the Giant Peach,"[Animation, Children's, Musical]",1996.0,F,Estudiante (Escuela/Instituto),17,A,MI
2,1,914,3,31/12/2000,My Fair Lady,"[Musical, Romance]",1964.0,F,Estudiante (Escuela/Instituto),17,A,MI
3,1,3408,4,31/12/2000,Erin Brockovich,[Drama],2000.0,F,Estudiante (Escuela/Instituto),17,A,MI
4,1,2355,5,06/01/2001,"Bug's Life, A","[Animation, Children's, Comedy]",1998.0,F,Estudiante (Escuela/Instituto),17,A,MI
