In [None]:
import pandas as pd
import numpy as np
import os
import random
import matplotlib.pyplot as plt
import seaborn as sns
import csv
from PIL import Image
import math
import plotly.express as px
import plotly.graph_objects as go
import scipy.stats as stats
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from scipy.stats import mode
from scipy.spatial.distance import cdist
from scipy.cluster.hierarchy import dendrogram, linkage

# Raw inputs: the cities table and the Amazon products table.
dfStart = pd.read_csv("ciudades.csv")
dfAmazon = pd.read_csv("amazon_product.csv")

# Feature-only view of the cities: the name, label and train/test flag columns are removed.
non_feature_columns = ["City", "label", "training"]
df = dfStart.drop(non_feature_columns, axis=1)

# Same features as a plain NumPy matrix.
data = df.to_numpy()

In [None]:
# 1.1. Descriptive statistics for every column of df
media = df.mean()
mediana = df.median()
desviacion = df.std()
moda = df.mode().iloc[0]  # first row of the mode table = first mode of each column


def _top_frequency(column):
    """Return the count of the most frequent value in ``column`` (NaN when it has no values)."""
    counts = column.value_counts()
    return counts.iloc[0] if counts.size > 0 else np.nan


moda_counts = df.apply(_top_frequency)

# Print each statistic under its heading.
for titulo, valores in (
    ("Media:\n", media),
    ("Mediana:\n", mediana),
    ("Desviación Estándar:\n", desviacion),
    ("Moda:\n", moda),
    ("Frecuencia de la Moda:\n", moda_counts),
):
    print(titulo, valores)
print("\n")

In [None]:
# 1.2. Boxplot: one panel per column of df, side by side.
# squeeze=False makes plt.subplots always return a 2-D array of Axes, so the
# code also works when df has a single column (otherwise a bare Axes object is
# returned and axs[i] fails). ravel() restores the flat indexing used below.
fig, axs = plt.subplots(1, len(df.columns), figsize=(20, 5), squeeze=False)
axs = axs.ravel()
for i, column in enumerate(df.columns):
    sns.boxplot(y=df[column], ax=axs[i])
    axs[i].set_title(column)
plt.tight_layout()
plt.show()

In [None]:
# 1.3. Covariance

# Pairwise covariance between all columns of df.
cov_matrix = df.cov()
# Bare name as last expression so the notebook renders the matrix.
cov_matrix

In [None]:
# 1.4. Correlation between GDP and population, computed by hand and with NumPy
x = df["GDP (USD Billion)"]
y = df["Population (Millions)"]

x_mean = x.mean()
y_mean = y.mean()

# Deviations from the mean, shared by numerator and denominator.
dx = x - x_mean
dy = y - y_mean

numerador = sum(dx * dy)
denominador = np.sqrt(sum(dx ** 2) * sum(dy ** 2))
correlacion_manual = numerador / denominador

# Reference value: off-diagonal entry of NumPy's correlation matrix.
correlacion_numpy = np.corrcoef(x, y)[0, 1]

print("Correlación manual:", correlacion_manual)
print("Correlación NumPy:", correlacion_numpy)

In [None]:
# 1.5. Relationship between covariance and correlation.
# The written answer is emitted with print so it appears in the cell output.
print("En si son dos medidas estadisticas que describen la relacion entre dos variables la covarianza mide como dos variables varian juntas pero su magnitud depende de las unidades de las variables la correlacion es una medida normalizada de la relacion lineal entre dos variables con valores entre -1 y 1")  


In [None]:
# Shared state for the k-means helpers defined below.
clusters = 4     # number of clusters
centroids = {}   # cluster index -> centroid row
it = 0           # iterations performed so far

# Read the cities file as a list of string rows; the first row is the header.
with open('ciudades.csv', 'r', newline='', encoding='utf-8') as file:
    k_cities = list(csv.reader(file))

# Position of the 'label' column, looked up in the header row.
label_col = k_cities[0].index('label')

def _parse_number(value):
    """Return ``value`` as a finite float, or ``None`` when it is not one."""
    try:
        number = float(value)
    except (TypeError, ValueError):
        return None
    return number if math.isfinite(number) else None


def normalize(data):
    """Standardise (z-score) the numeric columns of ``data`` in place.

    Each column's mean and sample standard deviation (n - 1 denominator) are
    computed from the cells that parse as finite numbers. Those same cells are
    then replaced by ``(value - mean) / std``. The same parser is used for
    both steps, so negative or exponent-formatted values contribute to the
    statistics instead of being normalised with statistics that ignored them.

    Left untouched: non-numeric cells, the column at the module-level
    ``label_col`` index, and columns with fewer than two numeric values or
    zero spread.

    Parameters:
        data: list of rows (lists); the number of columns is taken from the
            first row.

    Returns:
        The same ``data`` list, modified in place. Empty input is returned
        unchanged.
    """
    if not data:
        return data

    num_cols = len(data[0])
    means = [0] * num_cols
    stds = [0] * num_cols

    for c in range(num_cols):
        parsed = (_parse_number(row[c]) for row in data if c < len(row))
        col_values = [number for number in parsed if number is not None]
        # A sample standard deviation needs at least two observations.
        if len(col_values) < 2:
            continue
        means[c] = sum(col_values) / len(col_values)
        stds[c] = (sum((x - means[c]) ** 2 for x in col_values) / (len(col_values) - 1)) ** 0.5

    for row in data:
        for c in range(min(num_cols, len(row))):
            # stds[c] == 0 marks a column that is non-numeric or has no spread.
            if stds[c] == 0 or c == label_col:
                continue
            number = _parse_number(row[c])
            if number is not None:
                row[c] = (number - means[c]) / stds[c]

    return data

# Normalise every data row (the header row is skipped).
k_cities = normalize(k_cities[1:])

# Split on the last column ("Yes" / "No"); that column is dropped from the kept rows.
train_cities, test_cities = [], []
for row in k_cities:
    flag = row[-1]
    if flag == "Yes":
        train_cities.append(row[:-1])
    elif flag == "No":
        test_cities.append(row[:-1])

def get_distance(a, b):
    """Return the Euclidean distance between rows ``a`` and ``b``.

    Columns are visited by position over the length of ``b``. The column at
    the module-level ``label_col`` index is ignored, and so is any column
    where either value cannot be converted to ``float`` (for example a name).

    Parameters:
        a: first row (sequence), at least as long as ``b``.
        b: second row (sequence).

    Returns:
        The distance as a float; 0.0 when no column is comparable.
    """
    squared = 0.0

    for i in range(len(b)):
        if i == label_col:
            continue
        try:
            squared += (float(b[i]) - float(a[i])) ** 2
        except (TypeError, ValueError):
            # Only conversion failures are skipped; anything else (including
            # KeyboardInterrupt) propagates instead of being silently dropped.
            continue

    return math.sqrt(squared)

def get_centroid(data):
    """Return the column-wise mean of the rows in ``data``.

    The result has one entry per column of the first row. The column at the
    module-level ``label_col`` index and any cell that cannot be converted to
    ``float`` contribute nothing, so those entries stay 0.

    Parameters:
        data: non-empty list of rows (sequences).

    Returns:
        A list with the mean of each numeric column.

    Raises:
        ValueError: if ``data`` is empty (a cluster with no members).
    """
    if not data:
        raise ValueError("cannot compute the centroid of an empty cluster")

    size = len(data)
    centroid = [0] * len(data[0])

    for row in data:
        # Rows longer than the first one only contribute their leading columns.
        for col in range(min(len(row), len(centroid))):
            if col == label_col:
                continue
            try:
                centroid[col] += float(row[col]) / size
            except (TypeError, ValueError):
                # Non-numeric cell: skip it.
                continue
    return centroid

def reassign_label(city):
    """Assign ``city`` to its nearest centroid.

    The last element of ``city`` holds its current cluster label as a string.
    It is overwritten when a different cluster in the module-level
    ``centroids`` dict is closer.

    Returns:
        True if the label changed, False otherwise.
    """
    distances = {c: get_distance(centroids[c], city) for c in centroids}
    closest_cluster = str(min(distances, key=distances.get))

    if city[-1] == closest_cluster:
        return False

    city[-1] = closest_cluster
    return True

def calculate_centroids():
    """Recompute the centroid of every cluster from the current training labels.

    Results are stored in the module-level ``centroids`` dict, keyed by the
    integer cluster index.
    """
    for k in range(clusters):
        label = str(k)
        members = [row for row in train_cities if row[-1] == label]
        centroids[k] = get_centroid(members)

# K-means iterations, capped at one pass per city row: recompute the centroids,
# reassign every training city, and stop once no label changes.
for _ in range(len(k_cities)):
    it += 1
    calculate_centroids()

    # Building the full list guarantees every city is reassigned before the check.
    changed = [reassign_label(city) for city in train_cities]

    if not any(changed):
        calculate_centroids()
        break

# First and last column of each training row: city name and final cluster label.
tcdf = pd.DataFrame(np.array(train_cities)[:, [0, -1]], columns=["Ciudad", "Etiqueta"])

# Rows of the output file, header first.
output = [['ID', 'label']]

for city in test_cities:
    # Cluster whose centroid is nearest to this test city.
    d0 = {c: get_distance(centroids[c], city) for c in centroids}
    closest_cluster = min(d0, key=d0.get)

    # Training city nearest to that centroid; its name is written as the label.
    d1 = {t[0]: get_distance(centroids[closest_cluster], t) for t in train_cities}
    closest_city = min(d1, key=d1.get)

    output.append([city[0], closest_city])

with open('k_ciudadesCo.csv', 'w', newline='', encoding='utf-8') as file:
    csv.writer(file).writerows(output)

In [None]:
# 1.7 Dendrogram
data = df.to_numpy()

# Hierarchical clustering of the city rows with complete linkage.
Z = linkage(data, "complete")

# Draw on an explicit Axes, with the city names as leaf labels.
fig, ax = plt.subplots(figsize=(10, 5))
city_names = dfStart["City"].values
dendrogram(Z, labels=city_names, leaf_rotation=90, leaf_font_size=10, ax=ax)
ax.set_title("Dendrograma usando Complete Linkage")
ax.set_xlabel("Ciudades")
ax.set_ylabel("Distancia")
plt.show()

In [None]:
# 2.1. Covariance matrix of GDP and population
selected_columns = ["GDP (USD Billion)", "Population (Millions)"]
data_final = df.loc[:, selected_columns]

# Centre each column on its own mean before taking the covariance.
result = data_final.sub(data_final.mean(), axis="columns")

cov_matrix = result.cov()
cov_matrix

In [None]:
# 2.2. Compute the eigenvalues of the covariance matrix
eigenvalues, eigenvectors = np.linalg.eig(cov_matrix)  # eig returns both; unpack them into separate names
# The eigenvalues are printed twice (plain and under a "content" heading), plus their shape.
print("Eigenvalues: ", eigenvalues)
print("Eigenvalues shape:", eigenvalues.shape)
print("Eigenvalues content:\n", eigenvalues)


In [None]:
# 2.3. Share of the variance explained by each eigenvalue
# Total variance is the sum of all eigenvalues.
varianza_total = eigenvalues.sum()

# Fraction of that total carried by each component.
varianza_explicada = np.divide(eigenvalues, varianza_total)

print("Varianza total:", varianza_total)
print("Varianza explicada: ")
print(varianza_explicada)

In [None]:
# 2.4. Compute the eigenvectors
# np.linalg.eig returns (eigenvalues, eigenvectors). Only the second item is the
# eigenvector matrix (one eigenvector per column), so select it explicitly
# rather than binding and printing the whole result.
eigenvectors = np.linalg.eig(cov_matrix)[1]
print("Eigenvectors: ", eigenvectors)

In [None]:
# 2.5. Project the data onto the direction of the first eigenvector (the most important dimension)
# NOTE(review): the PCA is fitted on `data` (df.to_numpy(), every column of df), not on the
# two-column `data_final` used for the covariance matrix in 2.1 — confirm this is intended.
pca = PCA(n_components=1)
projected_data = pca.fit_transform(data)
print("2.5. Matriz proyectada: ", projected_data)

In [None]:
# 2.6. Difference between the original matrix and the reconstructed one
# Map the 1-D projection back into the original feature space.
reconstructed_data = pca.inverse_transform(projected_data)
# A single scalar: the norm of the whole residual matrix.
error = np.linalg.norm(data - reconstructed_data)
print("\n2.6. Error de reconstruccion:\n", error)


In [None]:
# 2.7. Plot the cities along the single principal component
plt.figure(figsize=(10, 5))

# All points sit on y = 0; only the x position carries information.
baseline = np.zeros_like(projected_data)
plt.scatter(projected_data, baseline, marker="x")

# Label every point with its city name.
for position, city in zip(projected_data, dfStart["City"]):
    plt.text(position, 0, city, fontsize=9, ha="right")

plt.title("Ciudades proyectadas en 1D")
plt.xlabel("Componente Principal 1")
plt.show()

In [None]:
# 2.8. Plot the cities on the first two principal components
pca_2d = PCA(n_components=2)
projected_data_2d = pca_2d.fit_transform(data)

pc1 = projected_data_2d[:, 0]
pc2 = projected_data_2d[:, 1]

plt.figure(figsize=(12, 9))
plt.scatter(pc1, pc2, marker="o")

# Label every point with its city name.
for (x_pos, y_pos), city in zip(projected_data_2d, dfStart["City"]):
    plt.text(x_pos, y_pos, city, fontsize=9)

plt.title("Ciudades proyectadas en 2D con PCA")
plt.xlabel("Componente Principal 1")
plt.ylabel("Componente Principal 2")
plt.show()

In [None]:
# 3. Simple linear regression of GDP on population
# 3.1. Least-squares coefficients computed by hand
Y = df["GDP (USD Billion)"]
X = df["Population (Millions)"]

X_mean = sum(X) / len(X)
Y_mean = sum(Y) / len(Y)

# Deviations of X from its mean, reused in numerator and denominator.
X_dev = X - X_mean

numerator = sum(X_dev * (Y - Y_mean))
denominator = sum(X_dev ** 2)

b1 = numerator / denominator   # slope
b0 = Y_mean - b1 * X_mean      # intercept
print(f"b0: {b0}, b1: {b1}")


# 3.2. Scatter of the data with the fitted line on top
fitted = b0 + b1 * X
plt.scatter(X, Y, color='blue', label='Datos')
plt.plot(X, fitted, color='red', label='Regresión')
plt.xlabel("Población (millones)")
plt.ylabel("PIB (USD Billones)")
plt.title("Regresión lineal de Población vs PIB")
plt.legend()
plt.show()


In [None]:
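# 4. PCA on face images: train_faces / test_faces list the numeric file names read from the faces/ directory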
train_faces = [1855,4729,3954,2886,3168,4943,2288,2872,5059,2618,3365,1432,5092,4140,1600,4372,3157,2085,1264,
                4716,3533,3701,4524,1290,2415,2627,3391,2243,4988,5066,4386,2071,2875,2049,4944,4178,3953,2881,
                1638,1852,3739,4381,3998,2076,3396,2244,5061,2620,1899,1297,2412,3706,4523,1263,4711,3534,1607,
                4375,3150,2082,3362,1435,5095,4147,4986,5068,4388,2843,3991,2629,1890,4718,1864,4972,3965,3159,
                2616,2424,2040,3192,4185,5057,2272,2888,3166,1631,4343,1403,4171,2286,3354,4515,3730,3502,1255,
                4727,1609,3962,4975,4149,3708,1863,1897,1299,2844,3996,2078,3398,4981,3505,1252,4720,4512,3737,
                1404,4176,2281,3353,3161,1636,4344,4182,5050,2275,2047,3195,2423,2611,3763,4546,4774,3551,2483,
                4310,1662,3135,3909,3307,4122,1450,1696,2013,2221,3797,2645,4780,2477,4921,3338,3936,1239,1837,
                4579,2448,2810,5209,4787,2470,3790,2642,2226,5003,1691,2014,2828,3300,4125,4919,1457,4317,1665,
                3132,4773,3556,2484,3764,4541,2817,2219,1830,2689,3569,3931,4328,4926,1468,5035,1495,2210,2022,
                5207,2446,3594,4583,2674,3560,4745,1237,4577,1839,2680,3752,4113,1461,3336,3104,3938,4321,1653,
                3799,2479,1698,2821,3907,3309,4910,4548,1806,3103,4326,1654,4114,1466,4928,3331,4570,2687,3755,
                3567,4742,1230,4584,2673,2441,3593,2025,2819,5200,5032,1492,2217,3558,1801,1459,4917,4319,3900,
                2228,2826,4789,1298,1896,3399,4980,2079,2845,3997,4148,4974,1608,3963,3709,1862,2046,3194,4183,
                5051,2274,2610,2422,4513,3736,3504,4721,1253,3160,4345,1637,4177,1405,2280,3352,1865,4719,3158,
                3964,4973,4389,2842,3990,5069,4987,2628,1891,4170,1402,2287,3355,3167,2889,4342,1630,3503,4726,
                1254,4514,3731,2425,2617,4184,5056,2273,2041,3193,3952,2880,1639,4179,4945,1853,3738,2048,2874,
                4710,1262,3535,3707,4522,3363,5094,4146,1434,4374,1606,3151,2083,3397,2245,5060,4380,2077,3999,
                1296,2413,2621,1898,5058,2873,2619,4728,1854,4942,2289,3169,3955,2887,2626,1291,2414,4387,2070,
                3390,2242,5067,4989,4373,1601,3156,2084,3364,5093,4141,1433,3700,4525,4717,1265,3532,2440,3592,
                4585,2672,1493,5033,2216,2818,2024,5201,1467,4929,4115,3330,3102,1655,4327,3566,1231,4743,4571,
                2686,3754,2827,2229,4788,1800,3559,4318,3901,1458,4916,4576,1838,2681,3753,3561,1236,4744,3939,
                3105,1652,4320,1460,4112,3337,2023,5206,1494,5034,2211,4582,2675,2447,3595,3308,4911,3906,4549,
                1807,2478,3798,1699,2820,1664,4316,3133,3301,4918,1456,4124,3765,4540,4772,3557,2485,3791,2643,
                4786,2471,1690,2829,2015,2227,5002,3568,1831,2688,4927,1469,3930,4329,2218,2816,2220,5005,1697,
                2012,4781,2476,3796,2644,4775,3550,2482,3762,1809,4547,3306,1451,4123,1663,4311,3908,3134,2449,
                2811,5208,3937,4920,3339,1836,4578,1238,1944,4638,3079,2997,3845,4852,2399,2963,5148,2709,3274,
                4051,5183,1523,4263,1711,2194,3046,4607,1375,3422,3610,4435,1381,2504,2736,2352,3280,5177,4899,
                4297,2160,2158,2964,4069,4855,2990,3842,1729,1943,3628,4290,2167,3889,2355,3287,5170,2731,1988,
                1386,2503,3617,4432,4600,1372,3425,4264,1716,2193,3041,3273,5184,1524,5179,4897,4299,3880,2952,
                2738,1981,4609,1975,4863,3048,3874,2707,2535,3083,2151,5146,4094,2363,3077,2999,4252,1720,4060,
                1512,3245,2397,4404,3621,3413,4636,1344,1718,3873,4058,4864,3619,1972,1986,1388,2169,3887,2955,
                3289,4890,3414,4631,1343,4403,3626,4067,1515,3242,2390,3070,4255,1727,5141,4093,2364,3084,2156,
                2532,2700,3672,4457,1919,1317,4665,2592,3440,1773,4201,3818,3024,3216,1541,4033,1787,2102,2330,
                5115,2754,3686,4691,2566,4830,3229,3827,1328,4468,1926,2559,2901,4696,2561,2753,3681,2337,5112,
                1780,2939,2105,3211,1546,4808,4034,1774,4206,3023,1310,4662,2595,3447,3675,4450,2906,2308,1921,
                2798,3478,3820,4239,1579,4837,1584,5124,2301,2133,3485,2557,4492,2765,3471,1326,4654,1928,4466,
                3643,2791,1570,4002,3227,3829,3015,1742,4230,3688,2568,1789,2930,3816,3218,4801,1917,4459,1319,
                3012,1745,4237,4839,1577,4005,3220,4461,3644,2796,3476,1321,4653,4495,2762,3482,2550,2908,2134,
                1583,5123,2306,3449,1910,4806,1548,4208,3811,2339,2937,4698,1389,1987,3288,4891,3886,2954,2168,
                4865,4059,1719,3872,3618,1973,3085,2157,5140,4092,2365,2701,2533,4402,3627,3415,1342,4630,3071,
                1726,4254,1514,4066,3243,2391,1974,4608,3875,3049,4862,4298,3881,2953,4896,5178,2739,1980,1513,
                4061,3244,2396,2998,3076,1721,4253,3412,1345,4637,4405,3620,2534,2706,5147,4095,2362,3082,2150,
                2991,3843,1728,4854,4068,1942,3629,2965,2159,1373,4601,3424,3616,4433,3272,1525,4057,5185,1717,
                4265,2192,3040,2354,3286,5171,4291,3888,2166,1387,2502,2730,1989,5149,2962,2708,4639,1945,4853,
                2398,2996,3844,3078,2737,1380,2505,4296,2161,2353,3281,4898,5176,1710,4262,2195,3047,3275,1522,
                4050,5182,3611,4434,1374,4606,3423,3483,2551,4494,2763,5122,1582,2307,2135,2909,4004,4838,1576,
                3221,3013,4236,1744,3477,4652,1320,4460,3645,2797,2936,2338,4699,1911,3448,4209,3810,4807,1549,
                1929,4467,3642,2790,3470,4655,1327,3014,3828,4231,1743,4003,1571,3226,2132,5125,1585,2300,4493,
                2764,3484,2556,3219,4800,3817,1318,1916,4458,2569,3689,1788,2931,4207,1775,3022,3210,4035,1547,
                4809,3674,4451,4663,1311,2594,3446,2752,3680,4697,2560,1781,2104,2938,2336,5113,3479,1920,2799,
                1578,4836,3821,4238,2309,2907,2331,5114,1786,2103,4690,2567,2755,3687,4664,1316,2593,3441,3673,
                4456,1918,3217,4032,1540,4200,1772,3025,3819,2558,2900,3826,4831,3228,4469,1927,1329,5109,2922,
                2748,4679,1905,4813,3038,3804,2777,4480,3497,2545,2121,2313,5136,1596,4222,1750,3007,3235,4010,
                1562,3651,2783,4474,4646,1334,3463,3803,1768,4028,4814,1902,3669,2589,2119,2925,4641,1333,3464,
                3656,2784,4473,3232,4017,1565,4225,1757,3000,2314,5131,1591,2126,3490,2542,2770,4487,1934,4648,
                3009,3835,4822,2913,5138,1598,2779,3499,4021,1553,3204,3036,4213,1761,2580,3452,4677,1305,4445,
                3660,2574,4683,2746,3694,5107,2322,2110,1795,4489,2128,2914,4019,4825,1759,3832,3658,1933,2117,
                1792,5100,2325,2741,3693,2573,4684,4442,3667,2587,3455,4670,1302,3031,4214,1766,4026,1554,3203,
                2371,5154,4086,3091,2143,2527,2715,1356,4624,3401,3633,4416,1958,3257,2385,1500,4072,1732,4240,
                3859,3065,1993,2518,3892,2940,4885,3866,2188,4871,3268,4429,1967,1369,1735,4247,3062,3250,2382,
                1507,4849,4075,3634,4411,1351,4623,3406,2712,2520,2978,3096,2144,2376,5153,4081,3439,1960,1538,
                4876,5198,3861,4278,4882,2349,3895,2947,1994,1969,4427,3602,3430,1367,4615,3868,2186,3054,1703,
                4271,1531,4043,5191,3266,2172,4285,5165,2340,3292,2724,2516,1393,3259,4840,2985,3857,1358,1956,
                4418,2529,4088,2971,2511,1394,2723,5162,2347,3295,2949,2175,4282,4878,1536,4044,5196,3261,2181,
                3053,1704,4276,3437,1360,4612,4420,3605,2976,3098,2378,1951,3408,4249,2982,3850,4847,1509,1758,
                3833,4824,4018,3659,1932,4488,2915,2129,2586,3454,1303,4671,4443,3666,1555,4027,3202,3030,1767,
                4215,5101,2324,2116,1793,2572,4685,2740,3692,1599,5139,2912,3498,2778,4649,1935,4823,3834,3008,
                2747,3695,2575,4682,2111,1794,5106,2323]

test_faces = [3037,1760,4212,1552,4020,3205,4444,3661,2581,3453,1304,4676,2924,2118,4815,4029,3802,1769,2588,
              1903,3668,2127,2315,1590,5130,2771,4486,3491,2543,3657,2785,4472,1332,4640,3465,1756,4224,3001,3233,
              1564,4016,1904,4678,3805,3039,4812,2923,5108,2749,3234,1563,4011,1751,4223,3006,1335,4647,3462,3650,
              2782,4475,3496,2544,2776,4481,2312,1597,5137,2120,2180,3052,4277,1705,4045,5197,4879,1537,3260,4421,
              3604,3436,4613,1361,2722,2510,1395,2174,2948,4283,5163,2346,3294,3409,1950,4846,1508,4248,2983,3851,
              2379,3099,2977,5164,2341,3293,2173,4284,2517,1392,2725,3431,4614,1366,1968,4426,3603,4042,5190,1530,
              3267,2187,3055,3869,4270,1702,2528,2970,4089,2984,3856,3258,4841,1957,4419,1359,2521,2713,2377,5152,
              4080,3097,2145,2979,3251,2383,4074,1506,4848,4246,1734,3063,4622,1350,3407,3635,4410,3894,2946,4883,
              2348,1995,1961,3438,3860,4279,5199,1539,4877,3632,4417,1959,4625,1357,3400,4241,1733,3064,3858,3256,
              2384,4073,1501,3090,2142,2370,5155,4087,2714,2526,4870,3269,2189,3867,1368,4428,1966,2519,1992,4884,
              3893,2941,5018,2833,2659,4768,1814,4902,3915,3129,2666,4591,2454,3586,5215,2030,2202,1487,5027,1641,
              4333,3116,3324,1473,4101,2692,3740,4565,1225,4757,3572,3912,1679,4905,4139,1813,3778,2498,2834,2008,
              4750,3575,2695,3747,4562,3323,1474,4106,1646,4334,3111,2205,1480,5020,5212,2037,2453,3581,2661,4596,
              1825,4759,3924,3118,4933,2802,1489,5029,2668,3588,1442,4130,3315,3127,1670,4302,3543,2491,4766,4554,
              3771,2465,4792,3785,2657,5016,2233,2001,1684,4598,2805,2039,4934,4108,1648,3923,3749,1822,2006,1683,
              5011]

In [None]:
#4.1.
def load_images(faces):
    """Read the raw face files named in ``faces`` from the ``faces/`` directory.

    Each file is decoded as a 128x128 single-channel ('L') image and flattened
    to one row of pixels.

    Parameters:
        faces: iterable of file names (converted with ``str``).

    Returns:
        A NumPy array with one flattened image per row.
    """
    def _read_face(face):
        # Raw bytes -> PIL image -> flat pixel vector.
        with open(os.path.join("faces/", str(face)), "rb") as f:
            raw = f.read()
            picture = Image.frombytes('L', (128, 128), raw)
        return np.array(picture).flatten()

    return np.array([_read_face(face) for face in faces])

def plot_image(data, label="Face", ax=None):
    """Show a flattened 128x128 image on a matplotlib Axes.

    Parameters:
        data: flat pixel vector; it is reshaped to (128, 128) before drawing.
        label: title placed above the image.
        ax: Axes to draw on. When omitted, a new figure with one Axes is created.

    Returns:
        ``(fig, ax)``: ``fig`` is the newly created figure, or None when an
        existing ``ax`` was supplied.
    """
    fig = None
    if ax is None:
        fig, ax = plt.subplots(1, 1, constrained_layout=True)

    image = data.reshape(128, 128)
    ax.imshow(image)
    ax.set_title(label=label)
    return fig, ax

# Load the training faces, one flattened image per row.
Tr = load_images(train_faces)

# Per-pixel mean over the training images, and the training matrix centred on it.
Tr_mean = np.mean(Tr, axis=0)
Tr_center = Tr - Tr_mean

# Show the mean face; as the last expression, the returned (fig, ax) tuple is also echoed by the notebook.
plot_image(Tr_mean, label="Average face")

In [None]:
# 4.2. PCA keeping 99% of the variance, then reconstructions with growing numbers of components
pca_model = PCA(n_components=0.99)
projected_data = pca_model.fit_transform(Tr_center)
cumulative_variance = pca_model.explained_variance_ratio_.cumsum()

# First component count whose cumulative explained variance reaches each threshold.
components_95 = (cumulative_variance >= 0.95).argmax() + 1
components_99 = (cumulative_variance >= 0.99).argmax() + 1

principal_components = pca_model.components_
components_to_visualize = [1, 3, 20, components_95, components_99]

# One row per training image: the original followed by its five reconstructions.
fig, axes = plt.subplots(5, 6, figsize=(18, 18))

for row, axes_row in enumerate(axes):
    plot_image(Tr[row], label=f"Imagen {row+1} (Original)", ax=axes_row[0])
    for col, n_components in enumerate(components_to_visualize):
        # Mean face plus the image's scores times the first n_components directions.
        scores = projected_data[row].reshape(1, -1)[:, :n_components]
        basis = principal_components[:n_components, :]
        reconstructed_image = Tr_mean + np.sum(np.dot(scores, basis), axis=0)
        plot_image(reconstructed_image, ax=axes_row[col+1], label=f"Imagen {row+1} ({n_components} Componentes)")

In [None]:
# Project the test faces with the components fitted on the training faces.
test_images = load_images(test_faces)
test_images_centered = test_images - Tr_mean
projected_test = test_images_centered.dot(pca_model.components_.T)

# Score of every test face on the first principal component.
first_component_values = projected_test[:, 0]

fig, axes = plt.subplots(1, 5, figsize=(15, 3))

# Reconstruct five randomly chosen test faces with components_99 components.
for target_ax in axes:
    random_index = random.randint(0, len(test_faces) - 1)
    scores = projected_test[random_index].reshape(1, -1)[:, :components_99]
    basis = principal_components[:components_99, :]
    reconstructed_img = Tr_mean + np.sum(np.dot(scores, basis), axis=0)
    plot_image(reconstructed_img, ax=target_ax, label=f"Imagen {random_index+1} ({components_99} Componentes)")

plt.tight_layout()
plt.show()

# One row per test face: its id and its first-component score.
results_df = pd.DataFrame({'id': test_faces, 'label': first_component_values})
results_df.to_csv('PCA_caras.csv', index=False)

In [None]:
# 5.1 Distribution of the variables

# Column groups used by the cells of section 5.
null = ["product_title", "product_availability", "delivery", "asin", "product_url", "product_photo"]
numerico = ["product_minimum_offer_price", "sales_volume", "product_price", "unit_count", "product_star_rating", "unit_price", "product_num_ratings", "product_original_price", "product_num_offers"]
categorico = ["is_best_seller", "currency", "has_variations", "is_amazon_choice", "climate_pledge_friendly", "is_prime"]

# 5.1.1 Bar chart of category frequencies for each categorical variable
for cat in categorico:
    if cat not in dfAmazon.columns:
        continue

    # The column is stored back as strings, so later cells see text categories.
    dfAmazon[cat] = dfAmazon[cat].astype(str)

    # Two-column table: category value and number of occurrences.
    counts = dfAmazon[cat].value_counts().rename_axis(cat).reset_index(name='count')

    fig = px.bar(counts, x=cat, y='count', title=f'Distribución de {cat}')
    fig.show()


In [None]:
# 5.1.2 Histograms for the numeric variables
# NOTE(review): the `numerico` columns are only converted with clean_numeric in 5.3.1,
# so they may still hold raw strings here — confirm the histograms bin numbers.
for col in numerico:
    fig = px.histogram(dfAmazon, x=col, title=f"Histograma de {col}", nbins=30)
    fig.show()



In [None]:
# 5.2 Relationship with sales_volume
# Values that cannot be parsed as numbers become NaN (errors='coerce').
# NOTE(review): if sales_volume holds free text (e.g. "1K+ bought"), every row becomes NaN here
# and clean_numeric in 5.3.1 can no longer recover the digits — verify against the CSV.
dfAmazon['sales_volume'] = pd.to_numeric(dfAmazon['sales_volume'], errors='coerce')

In [None]:
# 5.2.1 Boxplots of sales_volume for each categorical variable
for cat in categorico:
    fig = px.box(dfAmazon, x=cat, y='sales_volume', title=f'Boxplot de {cat} vs Sales Volume')
    fig.show()

In [None]:
# 5.2.2 Scatter plot of sales_volume against every other numeric variable
axis_labels = {'sales_volume': 'sales_volume'}
for col in numerico:
    if col == 'sales_volume':
        continue
    fig = px.scatter(
        dfAmazon,
        x=col,
        y='sales_volume',
        title=f'{col} vs sales_volume',
        labels={col: col, **axis_labels},
    )
    fig.show()

In [None]:
#5.3.1. Matriz de correlación
def clean_numeric(column):
    """Convert a column of loosely formatted values to numbers.

    Every value is turned into text, stripped of all characters other than
    digits, '.' and '-', and then parsed. Anything that still cannot be
    parsed becomes NaN.

    Parameters:
        column: pandas Series to clean.

    Returns:
        A numeric pandas Series with the same index.
    """
    as_text = column.astype(str)
    stripped = as_text.str.replace(r'[^0-9\.-]', '', regex=True)
    return pd.to_numeric(stripped, errors='coerce')

# Clean every numeric column that exists in the table (stored back in place).
for num in numerico:
    if num not in dfAmazon.columns:
        continue
    dfAmazon[num] = clean_numeric(dfAmazon[num])

numeric_df = dfAmazon[numerico]
corr_matrix = numeric_df.corr()

# Heatmap of the correlation matrix with the coefficient written in each cell.
heatmap_options = dict(
    title='Matriz de Correlación',
    labels=dict(color="Correlación"),
    color_continuous_scale="viridis",
    text_auto=True,
)
fig = px.imshow(corr_matrix, **heatmap_options)
fig.show()

In [None]:
# A positive coefficient indicates a direct relationship (as X increases, Y tends to increase); a negative coefficient indicates an inverse relationship (as X increases, Y tends to decrease).

In [None]:
# 5.3.2 Dummy variables for the categorical columns and their correlation matrix
# currency is left out of the dummy encoding.
dummy_source = dfAmazon[categorico].drop(columns=["currency"])
dfAmazon_dummies = pd.get_dummies(dummy_source)

corr_matrix_dummies = dfAmazon_dummies.corr()

fig = px.imshow(corr_matrix_dummies, title='Matriz de Correlación con Dummies')
fig.show()



In [None]:
# It is between has_variations (has variations) and climate_pledge_friendly (committed to the environment)

In [None]:
# 5.3.3 Fill missing numeric values with each column's mean
column_means = dfAmazon[numerico].mean()
dfAmazon[numerico] = dfAmazon[numerico].fillna(column_means)

# Linear regression of the star rating on the number of ratings.
X = dfAmazon[['product_num_ratings']]
y = dfAmazon['product_star_rating']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LinearRegression()
model.fit(X_train, y_train)

# Intercept and single slope of the fitted line.
b0, b1 = model.intercept_, model.coef_[0]

print(f'Regresión lineal: y = {b0:.4f} + {b1:.4f} * x')