## Feature Extraction

In [19]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from textblob import TextBlob

def process(df):
    # This is where you can do all your processing
    
    df['Helpfulness'] = df['HelpfulnessNumerator'] / df['HelpfulnessDenominator']
    df['Helpfulness'] = df['Helpfulness'].fillna(0)
    df['ReviewLength'] = df.apply(lambda row : len(row['Text'].split()) if type(row['Text']) == str else 0, axis = 1)
    
    # Add sentiment analysis using TextBlob
    df['Summary_sentiment_polarity'] = df['Summary'].apply(lambda x: TextBlob(str(x)).sentiment.polarity)
    df['Summary_sentiment_subjectivity'] = df['Summary'].apply(lambda x: TextBlob(str(x)).sentiment.subjectivity)
    df['Text_sentiment_polarity'] = df['Text'].apply(lambda x: TextBlob(str(x)).sentiment.polarity)
    df['Text_sentiment_subjectivity'] = df['Text'].apply(lambda x: TextBlob(str(x)).sentiment.subjectivity)
    
    scaler = MinMaxScaler()
    df['ReviewLength_normalized'] = scaler.fit_transform(df[['ReviewLength']])
    df['Time_normalized'] = scaler.fit_transform(df[['Time']])
    df['Summary_sentiment_polarity_normalized'] = scaler.fit_transform(df[['Summary_sentiment_polarity']])
    df['Text_sentiment_polarity_normalized'] = scaler.fit_transform(df[['Text_sentiment_polarity']])

    # Create a new column with the number of reviews of the ProductId
#     df["num_product_reviews"] = df.groupby("ProductId")["Id"].transform("count")
    
    # Create a new column with the mean socre of the ProductId
    df['avg_score'] = df.groupby('ProductId')['Score'].transform('mean')

    # Create a new column with the mean socre of the ProductId times the number of reviews of the ProductId
#     df['num_product_reviews_X_avg_score'] = df['num_product_reviews'] * df['avg_score']

    
    return df

## Creating your model

In [20]:
# Load the dataset
trainingSet = pd.read_csv("./data/train.csv")

# Process the DataFrame
train_processed = process(trainingSet)

# Load test set
submissionSet = pd.read_csv("./data/test.csv")

# Merge on Id so that the test set can have feature columns as well
testX= pd.merge(train_processed, submissionSet, left_on='Id', right_on='Id')
testX = testX.drop(columns=['Score_x'])
testX = testX.rename(columns={'Score_y': 'Score'})

# The training set is where the score is not null
# This line creates a new DataFrame called trainX by selecting only the rows in train_processed 
#     where the "Score" column is not null. 
# This assumes that the "Score" column is the target variable that we want to predict.
trainX =  train_processed[train_processed['Score'].notnull()]

# Save the datasets with the new features for easy access later
testX.to_csv("./test_data/X_test.csv", index=False)
trainX.to_csv("./test_data/X_train.csv", index=False)
# print(testX.head())
print()
print(trainX.head())


        Id   ProductId          UserId  HelpfulnessNumerator  \
0  1049849  B000MR9D5E  A1EKSETIBS9ETQ                     0   
1   999834  B000GAKFIG   AR0HFYHYHDGQQ                     2   
2   218826  6300215776  A37S3ACL57LN62                    11   
3   796384  B00019071C  A1TO1P3NV7OAU6                     2   
4  1219784  B001NFNFN0   ATCM1W7HWIC6U                     0   

   HelpfulnessDenominator        Time  \
0                       0  1198281600   
1                       5  1245024000   
2                      15  1126137600   
3                       2  1351036800   
4                       0  1381708800   

                                             Summary  \
0  Great nature series, but not all scenes looked...   
1                 Agatha Christie's Marple: Series 2   
2                             Childish Entertainment   
3                       The weakest Babylon 5 season   
4                            Versatile and effective   

                              

In [16]:
# check code
# ProductId that has the biggest values in column "num_product_reviews"
print(trainX.groupby('ProductId')['Id'].count().idxmax())

B001KVZ6HK


In [21]:
trainX = trainX.drop(columns=['Id', 'ProductId', 'UserId', 'Text', 'Summary'])
print(trainX.head())

   HelpfulnessNumerator  HelpfulnessDenominator        Time  Score  \
0                     0                       0  1198281600    4.0   
1                     2                       5  1245024000    5.0   
2                    11                      15  1126137600    2.0   
3                     2                       2  1351036800    4.0   
4                     0                       0  1381708800    5.0   

   Helpfulness  ReviewLength  Summary_sentiment_polarity  \
0     0.000000           165                         0.8   
1     0.400000            77                         0.0   
2     0.733333           222                        -0.2   
3     1.000000           110                         0.0   
4     0.000000           190                         0.6   

   Summary_sentiment_subjectivity  Text_sentiment_polarity  \
0                            0.75                 0.186411   
1                            0.00                 0.040000   
2                            0.8

In [1]:
import pickle
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix, mean_squared_error
from sklearn.decomposition import PCA

# Load training set with new features into DataFrame
X_train_NLP_9feature_500_700 = pd.read_csv("./data/X_train_NLP_9feature_500-700.csv")


# correlations = X_train_NLP_9feature_500_700.corr()['Score'].sort_values(ascending=False)

# return absolute value and return the top 20 columns
correlations = X_train_NLP_9feature_500_700.corr()['Score'].abs().sort_values(ascending=False)[:20].index.tolist()
# print(correlations)


# Print the column name and correlation, in decreasing order
for col, corr in correlations.items():
    print(f"{col}: {corr:.2f}")

AttributeError: 'list' object has no attribute 'items'