**Preprocesamiento 1:**
**Unir los CSV de categorías filtradas mediante web scraping**

In [1]:
import pandas as pd

# Read each scraped category table and tag it with its numeric class label
# in one step (0 = IA, 1 = Gaming, 2 = RWA, 3 = Meme).
gaming_data = pd.read_csv('To_Merge_CSV/Gaming_data.csv').assign(Class=1)
ia_data = pd.read_csv('To_Merge_CSV/IA_data.csv').assign(Class=0)
meme_data = pd.read_csv('To_Merge_CSV/Meme_data.csv').assign(Class=3)
rwa_data = pd.read_csv('To_Merge_CSV/RWA_data.csv').assign(Class=2)

# Stack the four tables in class order (IA, Gaming, RWA, Meme) under a fresh
# 0..n-1 row index.
combined_data = pd.concat(
    [ia_data, gaming_data, rwa_data, meme_data],
    ignore_index=True,
)

# Preview the first rows as a sanity check.
combined_data.head()




Unnamed: 0,Ranking,Name,Token,Price,1h%,24h%,7d%,Market Cap,Volume(24h),Circulating Supply,Class
0,17,NEAR Protocol,NEAR,$5.17,1.73%,13.25%,32.87%,"$5,851,379,335","$823,190,009\n159,325,316 NEAR","1,132,512,364 NEAR",0
1,22,Internet Computer,ICP,$8.66,0.69%,3.70%,8.80%,"$4,081,234,778","$110,144,715\n12,734,250 ICP","471,226,357 ICP",0
2,24,Artificial Superintelligence Alliance,FET,$1.62,0.01%,1.67%,28.22%,"$4,073,335,575","$327,675,519\n203,106,658 FET","2,520,000,000 FET",0
3,27,Bittensor,TAO,$526.48,0.25%,5.25%,82.96%,"$3,885,929,863","$335,896,682\n638,415 TAO","7,380,936 TAO",0
4,30,Render,RENDER,$6.02,0.71%,3.21%,27.62%,"$3,115,792,065","$495,356,599\n82,267,216 RENDER","517,460,631 RENDER",0


**Leer los datos históricos de cada halving (y de 250 días después) y guardar solo los datos necesarios**

In [3]:
import pandas as pd
import ast
import os

# Columnas a conservar
columns_to_keep = ['name', 'symbol', 'cmcRank', 'circulatingSupply', 'totalSupply', 'maxSupply', 'quotes','dateAdded']

# Función para cargar y procesar un archivo CSV
def process_halving_file(file_name, columns_to_keep):
    """Load one halving snapshot CSV and flatten its 'quotes' column.

    Reads ``file_name``, keeps only ``columns_to_keep`` (which must include
    'quotes'), expands every 'quotes' cell through ``extract_usd_data`` and
    returns the kept columns followed by the extracted ones, with the raw
    'quotes' column removed.
    """
    snapshot = pd.read_csv(file_name)[columns_to_keep]
    usd_columns = snapshot['quotes'].apply(extract_usd_data)
    flattened = pd.concat([snapshot, usd_columns], axis=1)
    return flattened.drop(columns=['quotes'])

# Función para extraer los datos en USD de la columna 'quotes'
def extract_usd_data(quotes):
    """Extract the quote fields from one raw 'quotes' cell.

    Parameters
    ----------
    quotes : str
        Python-literal text of a list of dicts. Only the first dict is read;
        the notebook treats it as the USD quote (assumption inherited from
        the original code -- confirm against the scraped data).

    Returns
    -------
    pandas.Series
        Indexed by 'price', 'volume24h', 'marketCap', 'percentChange1h',
        'percentChange24h' and 'percentChange7d'. A key missing from the
        dict yields None for that field. If the cell cannot be parsed or
        does not have the expected list-of-dicts shape, every field is None.
    """
    fields = (
        'price',
        'volume24h',
        'marketCap',
        'percentChange1h',
        'percentChange24h',
        'percentChange7d',
    )
    try:
        # Turn the text into Python objects without executing arbitrary code.
        quotes_list = ast.literal_eval(quotes)
        usd_data = quotes_list[0]
        return pd.Series({field: usd_data.get(field, None) for field in fields})
    except (ValueError, SyntaxError, TypeError, IndexError, KeyError, AttributeError):
        # ValueError: non-literal content or a non-string cell such as NaN.
        # SyntaxError: truncated or otherwise malformed text.
        # IndexError: empty list. KeyError/TypeError: payload is a dict or a
        # scalar rather than a list. AttributeError: first element is not a
        # dict. One bad row must not abort the whole file, so return nulls.
        return pd.Series({field: None for field in fields})

# Raw halving snapshots to clean: one per halving plus, for the first three,
# the snapshot taken 250 days later.
halving_files = [
    'Scrapped/halving_1.csv',
    'Scrapped/halving_1_plus250.csv',
    'Scrapped/halving_2.csv',
    'Scrapped/halving_2_plus250.csv',
    'Scrapped/halving_3.csv',
    'Scrapped/halving_3_plus250.csv',
    'Scrapped/halving_4.csv'
]

# Make sure the "Clean" output folder exists (relative to the working directory).
output_dir = 'Clean'
os.makedirs(output_dir, exist_ok=True)

# Clean every snapshot and keep the resulting frames in file order.
processed_data = []
for file in halving_files:
    processed_data.append(process_halving_file(file, columns_to_keep))

# Write each cleaned frame into "Clean" as "<original name>_clean.csv".
for file, cleaned in zip(halving_files, processed_data):
    clean_name = os.path.basename(file).replace('.csv', '_clean.csv')
    output_file = os.path.join(output_dir, clean_name)
    cleaned.to_csv(output_file, index=False)

# Show the first rows of the first cleaned snapshot.
print(processed_data[0].head())


        name symbol  cmcRank  circulatingSupply  totalSupply   maxSupply  \
0    Bitcoin    BTC        1        11091325.00  11091325.00  21000000.0   
1   Litecoin    LTC        2        17164230.00  17164230.00  84000000.0   
2   Peercoin    PPC        3        18757362.00  18757362.00         NaN   
3   Namecoin    NMC        4         5415300.00   5415300.00         NaN   
4  Terracoin    TRC        5         2323569.75   2323569.75  42000000.0   

                  dateAdded       price  volume24h     marketCap  \
0  2013-04-28T00:00:00.000Z  134.210022        0.0  1.488567e+09   
1  2013-04-28T00:00:00.000Z    4.348405        0.0  7.463702e+07   
2  2013-04-28T00:00:00.000Z    0.386525        0.0  7.250187e+06   
3  2013-04-28T00:00:00.000Z    1.107233        0.0  5.995997e+06   
4  2013-04-28T00:00:00.000Z    0.646892        0.0  1.503099e+06   

   percentChange1h  percentChange24h  percentChange7d  
0         0.639231               NaN              NaN  
1         0.799273    

**Combinar los datos de los halvings 1, 2, 3 y 4 con el dataset de tokens por categorías mediante un índice que representa el ranking del token en el dataset del halving correspondiente (RankIndex)**

In [4]:
import pandas as pd

# Load the cleaned halving snapshots, keyed by snapshot name.
halving_names = [
    'halving_1',
    'halving_1_plus250',
    'halving_2',
    'halving_2_plus250',
    'halving_3',
    'halving_3_plus250',
    'halving_4',
]
halving_files = {
    name: pd.read_csv(f'Clean/{name}_clean.csv') for name in halving_names
}

# Build one "<snapshot>(RankIndex)" column per snapshot holding the token's
# cmcRank in that snapshot (NaN when the token is absent from it).
#
# A name -> rank lookup is used instead of DataFrame.merge on purpose: a left
# merge emits one output row per matching right-hand row, so any snapshot that
# lists the same name more than once would silently multiply token rows. The
# lookup keeps the first listed row per name (the row a left merge would emit
# first) and skips rows without a name.
rank_columns = {}
for name, df in halving_files.items():
    rank_by_name = (
        df.dropna(subset=['name'])
        .drop_duplicates(subset='name', keep='first')
        .set_index('name')['cmcRank']
    )
    rank_columns[f'{name}(RankIndex)'] = combined_data['Name'].map(rank_by_name)

# assign() returns a new frame and overwrites columns that already exist, so
# re-running this cell does not create duplicate RankIndex columns.
combined_data = combined_data.assign(**rank_columns)

# Show the first rows as a sanity check.
print(combined_data.head())


   Ranking                                   Name   Token    Price    1h%  \
0       17                          NEAR Protocol    NEAR    $5.17  1.73%   
1       22                      Internet Computer     ICP    $8.66  0.69%   
2       24  Artificial Superintelligence Alliance     FET    $1.62  0.01%   
3       27                              Bittensor     TAO  $526.48  0.25%   
4       30                                 Render  RENDER    $6.02  0.71%   

     24h%     7d%      Market Cap                      Volume(24h)  \
0  13.25%  32.87%  $5,851,379,335   $823,190,009\n159,325,316 NEAR   
1   3.70%   8.80%  $4,081,234,778     $110,144,715\n12,734,250 ICP   
2   1.67%  28.22%  $4,073,335,575    $327,675,519\n203,106,658 FET   
3   5.25%  82.96%  $3,885,929,863        $335,896,682\n638,415 TAO   
4   3.21%  27.62%  $3,115,792,065  $495,356,599\n82,267,216 RENDER   

   Circulating Supply  Class  halving_1(RankIndex)  \
0  1,132,512,364 NEAR      0                   NaN   
1     47

**Agregar información del suministro de cada token (total_supply y max_supply) con los datos que proporciona la API de CoinMarketCap**

In [5]:
import requests
import pandas as pd
import time

import os

# CoinMarketCap API key used to complete the supply data. It is read from the
# CMC_PRO_API_KEY environment variable so the secret is never stored in the
# notebook. NOTE(review): the key that used to be hard-coded on this line has
# been exposed in the file history and should be revoked.
API_KEY = os.environ.get('CMC_PRO_API_KEY', '')
if not API_KEY:
    print("Warning: CMC_PRO_API_KEY is not set; CoinMarketCap requests are expected to fail.")

def fetch_cryptocurrencies(start, limit, timeout=30):
    """Fetch one page of the CoinMarketCap 'listings/latest' endpoint.

    Parameters
    ----------
    start : int
        Value sent as the 'start' query parameter.
    limit : int
        Value sent as the 'limit' query parameter.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).
        Without a timeout, requests can block indefinitely.

    Returns
    -------
    list
        The 'data' entry of the JSON response. An empty list is returned,
        after printing the problem, when the request fails, the body is not
        valid JSON, or the response has no 'data' entry.
    """
    url = 'https://pro-api.coinmarketcap.com/v1/cryptocurrency/listings/latest'
    headers = {
        # 'Accept' is the standard HTTP header name (the original sent 'Accepts').
        'Accept': 'application/json',
        'X-CMC_PRO_API_KEY': API_KEY,
    }
    params = {
        'start': start,
        'limit': limit,
        'convert': 'USD'
    }

    try:
        response = requests.get(url, headers=headers, params=params, timeout=timeout)
        data = response.json()
    except (requests.RequestException, ValueError) as exc:
        # Connection errors, timeouts and non-JSON bodies are reported the
        # same way as API-level errors so callers only handle one failure shape.
        print("Error fetching data:", exc)
        return []

    if isinstance(data, dict) and 'data' in data:
        return data['data']

    print("Error fetching data:", data)
    return []

def get_supply_data(pages=2, page_size=5000, pause=1):
    """Collect name, total supply and max supply for listed cryptocurrencies.

    Parameters
    ----------
    pages : int, optional
        Number of consecutive pages to request (default 2).
    page_size : int, optional
        Number of entries requested per page (default 5000).
    pause : float, optional
        Seconds to sleep between two consecutive page requests (default 1).

    Returns
    -------
    list of dict
        One dict per returned cryptocurrency with the keys 'name',
        'total_supply' and 'max_supply' (the last two are None when the API
        entry lacks them). Pages that fail contribute no entries.
    """
    supply_data = []
    for page in range(pages):
        start = page * page_size + 1
        print(f"Fetching cryptocurrencies {start} to {start + page_size - 1}...")

        for crypto in fetch_cryptocurrencies(start, page_size):
            supply_data.append({
                'name': crypto['name'],
                'total_supply': crypto.get('total_supply', None),
                'max_supply': crypto.get('max_supply', None)
            })

        # Pause only between requests; there is nothing to wait for after the
        # final page.
        if page < pages - 1:
            time.sleep(pause)

    return supply_data

def add_supply_to_combined_data(combined_file, supply_data=None,
                                output_path='BasicInformationByTokenClass.csv'):
    """Attach total/max supply to the token table and save it as CSV.

    Parameters
    ----------
    combined_file : pandas.DataFrame
        Token table with a 'Name' column. Despite the parameter name this is
        a DataFrame, not a path. It is not modified.
    supply_data : list of dict, optional
        Records with the keys 'name', 'total_supply' and 'max_supply'. When
        omitted, they are downloaded with ``get_supply_data()``.
    output_path : str, optional
        Destination CSV file (default 'BasicInformationByTokenClass.csv').

    Returns
    -------
    pandas.DataFrame
        One row per distinct 'Name' (first occurrence kept) with the columns
        'total_supply' and 'max_supply' added; both are missing for tokens
        that have no supply record.
    """
    if supply_data is None:
        supply_data = get_supply_data()

    supply_columns = ['total_supply', 'max_supply']

    # Naming the columns explicitly keeps the frame well-formed even when the
    # download returned nothing; otherwise the 'Name' lookups below would
    # raise KeyError on an empty, column-less frame.
    supply_df = (
        pd.DataFrame(supply_data, columns=['name'] + supply_columns)
        .rename(columns={'name': 'Name'})
        .drop_duplicates(subset='Name')
    )

    tokens = (
        combined_file
        # Drop supply columns left by a previous run so that re-running the
        # cell does not produce '_x' / '_y' suffixed duplicates.
        .drop(columns=supply_columns, errors='ignore')
        # Non in-place: the caller's frame keeps all of its rows.
        .drop_duplicates(subset='Name')
        .merge(supply_df, on='Name', how='left')
    )

    print(tokens.head())
    tokens.to_csv(output_path, index=False)
    return tokens


if __name__ == '__main__':
    # Keep the enriched, de-duplicated table available to later cells.
    combined_data = add_supply_to_combined_data(combined_data)


Fetching cryptocurrencies 1 to 5000...
Fetching cryptocurrencies 5001 to 10000...
   Ranking                                   Name   Token    Price    1h%  \
0       17                          NEAR Protocol    NEAR    $5.17  1.73%   
1       22                      Internet Computer     ICP    $8.66  0.69%   
2       24  Artificial Superintelligence Alliance     FET    $1.62  0.01%   
3       27                              Bittensor     TAO  $526.48  0.25%   
4       30                                 Render  RENDER    $6.02  0.71%   

     24h%     7d%      Market Cap                      Volume(24h)  \
0  13.25%  32.87%  $5,851,379,335   $823,190,009\n159,325,316 NEAR   
1   3.70%   8.80%  $4,081,234,778     $110,144,715\n12,734,250 ICP   
2   1.67%  28.22%  $4,073,335,575    $327,675,519\n203,106,658 FET   
3   5.25%  82.96%  $3,885,929,863        $335,896,682\n638,415 TAO   
4   3.21%  27.62%  $3,115,792,065  $495,356,599\n82,267,216 RENDER   

   Circulating Supply  Class  halv