# In [None]:  (notebook cell prompt left over from the export)
import pandas as pd
import numpy as np
import random
from sklearn.impute import SimpleImputer

# 2.1 - Load the example dataset and cast selected columns to explicit dtypes.
file_name = "ejemplo_data.csv"
df = pd.read_csv(file_name)

# DataFrame.info() writes its report straight to stdout and returns None, so
# it is called directly; wrapping it in print() appends a stray "None" line.
print("Tipos iniciales de variables:")
df.info()

# NOTE(review): astype(int) raises if 'ID' holds missing values, and
# astype(bool) maps every non-empty string to True -- confirm that the CSV
# stores 'Activo' as real booleans or 0/1 before trusting the result.
df['ID'] = df['ID'].astype(int)
df['Activo'] = df['Activo'].astype(bool)

print("\nTipos de variables después de transformar 'ID' y 'Activo':")
df.info()

df['unidades'] = df['unidades'].astype(int)
df['2016'] = df['2016'].astype(float)

print("\nTipos de variables después de transformar 'unidades' y '2016':")
df.info()

# 2.2 - Load the e-commerce dataset, normalise column types, derive the
# Date/Time/TotalAmount columns and export the processed table.
file_name = "ecommerce_data.csv"
df = pd.read_csv(file_name)

# DataFrame.info() prints its own report and returns None; calling it inside
# print() would emit an extra "None" line after every report.
print("Tipos iniciales de variables:")
df.info()

# NOTE(review): astype(int) raises on non-numeric or missing invoice numbers,
# and astype(str) can turn missing descriptions into the literal text 'nan'
# -- confirm both are acceptable for this dataset.
df['InvoiceNo'] = df['InvoiceNo'].astype(int)
df['Description'] = df['Description'].astype(str)

print("\nTipos de variables después de transformar 'InvoiceNo' y 'Description':")
df.info()

df['Quantity'] = df['Quantity'].astype(int)
df['UnitPrice'] = df['UnitPrice'].astype(float)

# Split only at the first space so the result always has at most two columns;
# an unbounded split yields three or more columns when the time part itself
# contains a space (e.g. "8:26 AM") and the two-column assignment then fails.
df[['Date', 'Time']] = df['InvoiceDate'].str.split(' ', n=1, expand=True)

# Line total for each row of the invoice.
df['TotalAmount'] = df['Quantity'] * df['UnitPrice']

processed_file_name = "processed_ecommerce_data.csv"
df.to_csv(processed_file_name, index=False)
print(f"\nBase de datos procesada exportada como {processed_file_name}")

# Total billed per invoice: sum of the line totals within each InvoiceNo.
invoice_groups = df.groupby('InvoiceNo')
grouped_data = invoice_groups['TotalAmount'].sum()
print("\nTotal por factura:\n", grouped_data)

# Rows ordered from the largest to the smallest line total.
df_sorted = df.sort_values('TotalAmount', ascending=False)
print("\nDatos ordenados por monto total:\n", df_sorted.head())

# Copy of the table that uses the invoice number as its index.
df_indexed = df.set_index(keys='InvoiceNo')
print("\nDatos con 'InvoiceNo' como índice:\n", df_indexed.head())

# Five rows drawn at random (unseeded, so the rows differ between runs).
sample_data = df.sample(5)
print("\nMuestra aleatoria de 5 filas:\n", sample_data)

# Date x invoice matrix holding the summed line totals.
df_pivot = df.pivot_table(
    index='Date',
    columns='InvoiceNo',
    values='TotalAmount',
    aggfunc='sum',
)
print("\nTabla pivote:\n", df_pivot.head())

# Move the invoice number back from the index into a regular column.
df_reset = df_indexed.reset_index()
print("\nDatos con índice reseteado:\n", df_reset.head())

# Small lookup table joined onto the main data; the left join keeps every
# row of df and leaves AdditionalInfo empty where no invoice matches.
extra_invoices = [123456, 123457]
extra_labels = ['Info1', 'Info2']
additional_data = pd.DataFrame({
    'InvoiceNo': extra_invoices,
    'AdditionalInfo': extra_labels,
})
df_merged = pd.merge(left=df, right=additional_data, how='left', on='InvoiceNo')
print("\nDatos fusionados:\n", df_merged.head())

# 2.3 - Build a 50-row synthetic dataset (three uniform numeric attributes
# plus a random category) and report its descriptive statistics.
row_count = 50
attribute_bounds = {
    'Atributo1': (1, 100),
    'Atributo2': (200, 300),
    'Atributo3': (500, 1000),
}

# Columns are filled one after another, numeric attributes first and the
# category last, so the random generator is consumed column by column.
data = {}
for column, (low, high) in attribute_bounds.items():
    data[column] = [random.uniform(low, high) for _ in range(row_count)]
data['Categoria'] = [random.choice(['A', 'B', 'C']) for _ in range(row_count)]

df = pd.DataFrame(data)

# Statistics are computed on the numeric attributes only.
numeric_part = df[list(attribute_bounds)]

print("Estadísticas descriptivas de tendencia central:")
print(numeric_part.mean())
print(numeric_part.median())

print("\nEstadísticas descriptivas de dispersión:")
print(numeric_part.std())
print(numeric_part.var())

# 2.4 - Diagnose and impute missing values in the ratings data, add the mean
# rating per ISBN, join the book metadata and export the consolidated table.
ratings_file = "ratings_data.csv"
books_file = "books_data.csv"
ratings_df = pd.read_csv(ratings_file)
books_df = pd.read_csv(books_file)

print("\nDiagnóstico de valores perdidos en ratings data:")
print(ratings_df.isnull().sum())

# Mean imputation is only defined for numeric data: fitting the imputer on the
# whole frame raises as soon as one column holds text. Only numeric columns
# that have gaps AND at least one observed value are imputed, which also
# leaves complete integer columns with their original dtype. The 'ISBN' merge
# key is skipped because an averaged identifier would be a fabricated key.
imputer_mean = SimpleImputer(strategy='mean')
numeric_part = ratings_df.select_dtypes(include='number')
cols_to_impute = [
    col for col in numeric_part.columns
    if col != 'ISBN'
    and numeric_part[col].isna().any()
    and numeric_part[col].notna().any()
]
if cols_to_impute:
    ratings_df[cols_to_impute] = imputer_mean.fit_transform(ratings_df[cols_to_impute])

# Whatever is still missing (non-numeric, key or fully empty columns) is
# flagged with the sentinel -1.
ratings_df.fillna(value=-1, inplace=True)
print("\nValores después de imputación:")
print(ratings_df.head())

# Mean rating of each book, repeated on every row of that ISBN.
ratings_df['ISBN_avg'] = ratings_df.groupby('ISBN')['Rating'].transform('mean')

# Left join keeps every rating even when the book metadata is missing.
consolidated_df = pd.merge(ratings_df, books_df, on='ISBN', how='left')

consolidated_file = "consolidated_data.csv"
consolidated_df.to_csv(consolidated_file, index=False)
print(f"\nBase de datos consolidada exportada como {consolidated_file}")


# Notebook output residue: KeyboardInterrupt