# Inicial analysis

In [7]:
import pandas as pd

# Load the dataset
file_path = '../data/Cell_Phones_&_Accessories.txt.gz'

try:
    df = pd.read_csv(file_path, compression='gzip', sep='\t')
    # Display the first few rows of the dataframe
    print(df.head())
except FileNotFoundError:
    print("File not found. Please ensure the correct file path.")
except Exception as e:
    print(f"An error occurred: {e}")


                       product/productId: B000JVER7W
0  product/title: Mobile Action MA730 Handset Man...
1                             product/price: unknown
2                      review/userId: A1RXYH9ROBAKEZ
3                        review/profileName: A. Igoe
4                            review/helpfulness: 0/0


## Adjust to Data Format

The data is stored as separate lines, with the format `category_name: contained_value`. Each line represents a part of the review, such as the product title, price, or user information.

To make the data more usable, we need to process the file and extract the key-value pairs, grouping them into a structured format (such as a DataFrame) for easier analysis. This includes:
- Parsing each line into key-value pairs.
- Organizing related data (e.g., product information, user reviews) into individual records.
- Converting the data into a format that can be easily analyzed with Python tools like pandas.

The following code performs these steps by reading the file, extracting relevant information, and converting it into a structured DataFrame.


In [8]:
import pandas as pd
import gzip

# Load the dataset
file_path = '../data/Cell_Phones_&_Accessories.txt.gz'

try:
    # Wczytujemy plik .gz
    with gzip.open(file_path, 'rt', encoding='utf-8') as f:
        lines = f.readlines()

    # Split each line by ': ' and convert to a dictionary of product attributes
    data = []
    current_review = {}

    for line in lines:
        line = line.strip()
        
        # Sprawdzamy, czy linia zawiera dwie części (klucz i wartość)
        if ": " in line:
            key, value = line.split(": ", 1)
            if key.startswith("product"):
                if current_review:
                    data.append(current_review)  # Append the previous review if it's complete
                current_review = {key: value}  # Start new review
            else:
                current_review[key] = value  # Add additional data to the current review
    
    if current_review:
        data.append(current_review)  # Append the last review if any

    # Convert to DataFrame
    df = pd.DataFrame(data)
    # Display the first few rows
    print(df.head())

except FileNotFoundError:
    print("File not found. Please ensure the correct file path.")
except Exception as e:
    print(f"An error occurred: {e}")


  product/productId                                      product/title  \
0        B000JVER7W                                                NaN   
1               NaN  Mobile Action MA730 Handset Manager - Bluetoot...   
2               NaN                                                NaN   
3        B000JVER7W                                                NaN   
4               NaN  Mobile Action MA730 Handset Manager - Bluetoot...   

  product/price   review/userId review/profileName review/helpfulness  \
0           NaN             NaN                NaN                NaN   
1           NaN             NaN                NaN                NaN   
2       unknown  A1RXYH9ROBAKEZ            A. Igoe                0/0   
3           NaN             NaN                NaN                NaN   
4           NaN             NaN                NaN                NaN   

  review/score review/time review/summary  \
0          NaN         NaN            NaN   
1          NaN         NaN

In [9]:
# Statistics for missing values and 'unknown'
nan_counts = df.isna().sum()  # Count NaN in each column
unknown_counts = (df == 'unknown').sum()  # Count occurrences of 'unknown' in each column

# General information about the data
total_rows = len(df)
total_columns = len(df.columns)

# Display results
print("Number of rows in the dataset:", total_rows)
print("Number of columns in the dataset:", total_columns)

print("\nStatistics for NaN values:")
print(nan_counts)

print("\nStatistics for 'unknown' values:")
print(unknown_counts)

# Percentage of missing data for each column
nan_percentage = (nan_counts / total_rows) * 100
print("\nPercentage of missing data for each column:")
print(nan_percentage)


Number of rows in the dataset: 236243
Number of columns in the dataset: 10

Statistics for NaN values:
product/productId     157313
product/title         157860
product/price         157313
review/userId         157313
review/profileName    157313
review/helpfulness    157313
review/score          157313
review/time           157313
review/summary        157315
review/text           157313
dtype: int64

Statistics for 'unknown' values:
product/productId         0
product/title             0
product/price         43287
review/userId          2276
review/profileName     2276
review/helpfulness        0
review/score              0
review/time               0
review/summary            0
review/text               0
dtype: int64

Percentage of missing data for each column:
product/productId     66.589486
product/title         66.821028
product/price         66.589486
review/userId         66.589486
review/profileName    66.589486
review/helpfulness    66.589486
review/score          66.58948

In [10]:
print("Examples where 'review/text' is NaN:")
print(df[df['review/text'].isna()].head(5))

print("\nExamples where 'product/price' is 'unknown':")
print(df[df['product/price'] == 'unknown'].head(5))

print("\nExamples with multiple NaNs in a row:")
print(df[df.isna().sum(axis=1) > 7].head(5))


Examples where 'review/text' is NaN:
  product/productId                                      product/title  \
0        B000JVER7W                                                NaN   
1               NaN  Mobile Action MA730 Handset Manager - Bluetoot...   
3        B000JVER7W                                                NaN   
4               NaN  Mobile Action MA730 Handset Manager - Bluetoot...   
6        B000JVER7W                                                NaN   

  product/price review/userId review/profileName review/helpfulness  \
0           NaN           NaN                NaN                NaN   
1           NaN           NaN                NaN                NaN   
3           NaN           NaN                NaN                NaN   
4           NaN           NaN                NaN                NaN   
6           NaN           NaN                NaN                NaN   

  review/score review/time review/summary review/text  
0          NaN         NaN         