# Prepare for data analyze

In [1]:
// ML.NET Nuget packages installation
#r "nuget:Microsoft.ML"
    
//Install XPlot package
#r "nuget:XPlot.Plotly"

//Install Recommender
#r "nuget:Microsoft.ML.Recommender"

using Microsoft.ML;
using Microsoft.ML.Recommender;
using Microsoft.ML.Data;
using Microsoft.ML.Trainers;
using System.Linq;
using System.Text.Json;
using System.IO;
using XPlot.Plotly;

## Models

In [1]:
public class ReviewModel
{
    public string reviewerID { get; set; }
    public string asin { get; set; }
    public string reviewerName { get; set; }
    public int[] helpful { get; set; }
    public string reviewText { get; set; }
    /// <summary>
    /// Оценка
    /// </summary>
    public float overall { get; set; }
    public string summary { get; set; }
    public int unixReviewTime { get; set; }
    public string reviewTime { get; set; }
}
public class ProductPrediction
{
    public string asin;
    public float Score;
}

## Load Data

In [1]:
var fileName = "./Data/Clothing_Shoes_and_Jewelry_5.json";
var jsonString = File.ReadAllText(fileName);
jsonString = jsonString.Replace('\n', ',');
var reviewsSrc = Newtonsoft.Json.JsonConvert.DeserializeObject<List<ReviewModel>>("["+jsonString+"]");
MLContext mlContext = new MLContext();
int countTest = reviewsSrc.Count()/8;
var trainDataView = mlContext.Data.LoadFromEnumerable<ReviewModel>(reviewsSrc);
var testDataView = mlContext.Data.LoadFromEnumerable<ReviewModel>(reviewsSrc.Take(countTest));
display("Schema of training DataView:");
display(trainDataView.Preview(1).RowView);

Schema of training DataView:

index,Values
0,"[ reviewerID: A1KLRMWW2FWPL4, asin: 0000031887, reviewerName: Amazon Customer ""cameramom"", helpful: Dense vector of size 2, reviewText: This is a great tutu and at a really great price. It doesn't look cheap at all. I'm so glad I looked on Amazon and found such an affordable tutu that isn't made poorly. A++, overall: 5, summary: Great tutu- not cheaply made, unixReviewTime: 1297468800, reviewTime: 02 12, 2011 ]"


In [1]:
# Histograms

Error: [object Object]

## Get columns

In [1]:
//Extract some data into arrays for plotting:

int numberOfRows = 1000;
float[] scores = trainDataView.GetColumn<float>(nameof(ReviewModel.overall)).Take(numberOfRows).ToArray();
int[] times = trainDataView.GetColumn<int>(nameof(ReviewModel.unixReviewTime)).Take(numberOfRows).ToArray();
string[] reviews = trainDataView.GetColumn<string>(nameof(ReviewModel.reviewText)).Take(numberOfRows).ToArray();
string[] names = trainDataView.GetColumn<string>(nameof(ReviewModel.reviewerName)).Take(numberOfRows).ToArray();

## Histogram numbers of scores

In [1]:
// Distribution of number of scores
//XPlot Histogram reference: http://tpetricek.github.io/XPlot/reference/xplot-plotly-graph-histogram.html

var faresHistogram = Chart.Plot(new Histogram(){x = scores, autobinx = false, nbinsx = 20});
var layout = new Layout.Layout(){title="Number of scores"};
faresHistogram.WithLayout(layout);
faresHistogram.WithXTitle("Scores");
faresHistogram.WithYTitle("Numbers");
faresHistogram.Show();
display(faresHistogram);

Height,Id,PlotlySrc,Width
500,437ff2a3-66ca-4dfa-856f-3cae80c73bb3,https://cdn.plot.ly/plotly-latest.min.js,900


## Score and Review Length

In [1]:
var chartFareVsTime = Chart.Plot(
    new Scatter()
    {
        x =scores.Take(200) ,
        y = reviews.Select(x=>x.Length).Take(200).ToArray(),
        mode = "markers",
        marker = new Marker()
        {
            color = scores,
            colorscale = "Jet"
        }
    }
);

var layout = new Layout.Layout(){title="Plot score and review length"};
chartFareVsTime.WithLayout(layout);
chartFareVsTime.Width = 500;
chartFareVsTime.Height = 500;
chartFareVsTime.WithXTitle("Scores");
chartFareVsTime.WithYTitle("Review Length");
chartFareVsTime.WithLegend(false);
chartFareVsTime.Show();
display(chartFareVsTime);

Height,Id,PlotlySrc,Width
500,36352054-7ce7-412d-b5df-ce1934fac704,https://cdn.plot.ly/plotly-latest.min.js,500


In [1]:
//var collection = products.GroupBy(x=>x.ProductId).OrderByDescending(x=>x.Count());
//var scorses = collection.First(x=>x.Any(y=>y.Label<5)).Select(x=>x.Label).Take(500);
//display(scorses);
var chartFareVsTime = Chart.Plot(
    new Box()
    {
        x =scores ,
        y = reviews.Select(x=>x.Length).Where(x=>x<2000).ToArray()
    }
);

var layout = new Layout.Layout(){title="Plot score and review length"};
chartFareVsTime.WithLayout(layout);
chartFareVsTime.Width = 500;
chartFareVsTime.Height = 500;
chartFareVsTime.WithXTitle("Scores");
chartFareVsTime.WithYTitle("Review Length");
chartFareVsTime.WithLegend(false);
chartFareVsTime.Show();
display(chartFareVsTime);

Height,Id,PlotlySrc,Width
500,b27e193c-2ff3-4321-8e30-bb174521fede,https://cdn.plot.ly/plotly-latest.min.js,500


# Build and train model

In [1]:
IEstimator<ITransformer> estimator = mlContext
    .Transforms
    .Conversion
    .MapValueToKey(outputColumnName: "ReviewerIdEncoded", inputColumnName: $"{nameof(ReviewModel.reviewerID)}")
    .Append(mlContext
        .Transforms
        .Conversion
        .MapValueToKey(outputColumnName: "ProductIdEncoded", inputColumnName: $"{nameof(ReviewModel.asin)}"));
var options = new MatrixFactorizationTrainer.Options
{
    MatrixColumnIndexColumnName = "ReviewerIdEncoded",
    MatrixRowIndexColumnName = "ProductIdEncoded",
    LabelColumnName = $"{nameof(ReviewModel.overall)}",
    NumberOfIterations = 20,
    ApproximationRank = 100
};
var trainerEstimator = estimator
    .Append(mlContext
        .Recommendation()
        .Trainers
        .MatrixFactorization(options));
Console.WriteLine("========================== Training the model =============================");
ITransformer model = trainerEstimator.Fit(trainDataView);
display(model.GetOutputSchema(trainDataView.Schema))



index,Name,Index,IsHidden,Type,Annotations
0,reviewerID,0,False,String,
1,asin,1,False,String,
2,reviewerName,2,False,String,
3,helpful,3,False,Vector<Int32>,
4,reviewText,4,False,String,
5,overall,5,False,Single,
6,summary,6,False,String,
7,unixReviewTime,7,False,Int32,
8,reviewTime,8,False,String,
9,ReviewerIdEncoded,9,False,"Key<UInt32, 0-39386>",KeyValues


## Evaluate model

In [1]:
public static void EvaluateModel(MLContext mlContext, IDataView testDataView, ITransformer model)
{
    Console.WriteLine("========================== Evaluating the model =============================");
    var prediction = model.Transform(testDataView);
    var metrics = mlContext
        .Regression
        .Evaluate(prediction, labelColumnName: $"{nameof(ReviewModel.overall)}", scoreColumnName: "Score");
    Console.WriteLine("Root Mean Squared Error : " + metrics.RootMeanSquaredError.ToString());
    Console.WriteLine("RSquared: " + metrics.RSquared.ToString());
}
EvaluateModel(mlContext, testDataView, model);



Root Mean Squared Error : 0.36576310438437654


RSquared: 0.8836388861942855


# Predict by model

## Single prediction

In [1]:
public static void UseModelForSinglePrediction(MLContext mlContext, ITransformer model, IEnumerable<ReviewModel> products)
{
    Console.WriteLine("=========================== Making a prediction =============================");
    var predictionEngine = mlContext
        .Model
        .CreatePredictionEngine<ReviewModel, ProductPrediction>(model);

    foreach(var product in products.GroupBy(x=>x.asin).Select(x=>x.Key).Take(10))
    {
        var testInput = new ReviewModel { reviewerID = "A1KLRMWW2FWPL4", asin = product};
        var movieRatingPrediction = predictionEngine.Predict(testInput);
        if (Math.Round(movieRatingPrediction.Score, 1) > 3.5)
        {
            Console.WriteLine("Product " + testInput.asin + " is recommended for user " + testInput.reviewerID + ". Score=" + movieRatingPrediction.Score);
        }
        else
        {
            Console.WriteLine("Product " + testInput.asin + " is not recommended for user " + testInput.reviewerID + ". Score=" + movieRatingPrediction.Score);
        }
    }
}
UseModelForSinglePrediction(mlContext, model, reviewsSrc);



Product 0000031887 is recommended for user A1KLRMWW2FWPL4. Score=4.721161


Product 0123456479 is recommended for user A1KLRMWW2FWPL4. Score=4.6124706


Product 1608299953 is recommended for user A1KLRMWW2FWPL4. Score=3.8849292


Product 1617160377 is recommended for user A1KLRMWW2FWPL4. Score=4.823443


Product B00001W0KA is recommended for user A1KLRMWW2FWPL4. Score=4.8082485


Product B00001WRHJ is recommended for user A1KLRMWW2FWPL4. Score=4.034339


Product B00004SR8W is recommended for user A1KLRMWW2FWPL4. Score=4.684068


Product B00004SR8Z is recommended for user A1KLRMWW2FWPL4. Score=4.2584906


Product B00004SR9P is recommended for user A1KLRMWW2FWPL4. Score=3.8755589


Product B00004U1J2 is not recommended for user A1KLRMWW2FWPL4. Score=3.5261781


## SaveModel

In [1]:
var modelPath = "./Data/MovieRecommenderModel.zip";
public static void SaveModel(MLContext mlContext, DataViewSchema trainDataViewSchema, ITransformer model, string modelPath)
{

    Console.WriteLine("========================== Saving the model to a file ==================================");
    mlContext
        .Model
        .Save(model, trainDataViewSchema, modelPath);
}
SaveModel(mlContext, trainDataView.Schema, model, modelPath);



## Get Model from a file

In [1]:
MLContext mLContext = new MLContext();
DataViewSchema schema;
var model1 = mlContext.Model.Load(modelPath, out schema);

## Evaluate model from a file

In [1]:
EvaluateModel(mlContext, testDataView, model1);



Root Mean Squared Error : 0.36576310438437654


RSquared: 0.8836388861942855
