In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
from dateutil.parser import parse
import string #To remove punctuation. NLP

metadata_file = "output/App_metadata.csv"
rel_notes_folder = "Automobile_extracts/"
NUM_SAMPLE_APPS = 30

# READ DATA

In [2]:
##Read metadata for automobile apps collected ( 30 apps from each category). 
df_metadata = pd.read_csv(metadata_file)
df_metadata
df_metadata["category"].unique() #Check categories, we are going to experiment on automotive
# array(['automotive', 'books', 'business', 'education', 'entertainment',
#        'family and relationship', 'finance', 'food', 'graphic and design',
#        'game', 'health and fitness', 'lifestyle', 'news and magazines',
#        'photo and video', 'real estate', 'shopping', 'social media',
#        'sports', 'travel', 'utility and productivity', 'games'],
#       dtype=object)

auto_meta_df = df_metadata[df_metadata["category"]== "automotive"]
auto_meta_df
app_names = auto_meta_df['title'].values

##Read the release notes of the apps

# Import libraries
import glob

# Get CSV files list from a folder
csv_files = glob.glob(rel_notes_folder + "/*.csv")

# Read each CSV file into DataFrame
# This creates a list of dataframes
df_list = (pd.read_csv(file) for file in csv_files)

#Sort the dataframe according to date decreasing order
# for i in df_list:
#     i = i.sort_values(by=['Date'], ascending=False)

big_df = pd.DataFrame() # Create empty dataframe
for i in df_list:
#     #Sort the dataframe according to date decreasing order
#     for j in range(i.shape[0]):
#         i["Date"][j] = parse(i["Date"][j])
#     i = i.sort_values(by=['Date'], ascending=False)
    # Concatenate all DataFrames
    big_df = pd.concat([big_df,i], ignore_index=True)
big_df




Unnamed: 0,App,Ver,Date,Notes
0,Simply Auto: Car Maintenance &,50.1.1.1,"Mar 24, 2022",Bug fix for overlapping text
1,Simply Auto: Car Maintenance &,49.1.1.1,"Mar 10, 2022",-Updated save icon-Increased font size on the ...
2,Simply Auto: Car Maintenance &,48.1.1.1,"Feb 17, 2022",Bugfix for Total Cost calculation
3,Simply Auto: Car Maintenance &,47.1.1.3,"Feb 7, 2022","-Easy access to delete Vehicles, Fill-ups, ser..."
4,Simply Auto: Car Maintenance &,46.1.2.1,"Jan 22, 2022",- Fixed crashes- Fixed service type delete bug
...,...,...,...,...
751,CAR PROBLEMS AND REPAIRS,2.2.1,"Nov 2, 2020",Level improvements!
752,CAR PROBLEMS AND REPAIRS,2.2.0,"Oct 24, 2020",New mechanic and reward system come with this ...
753,CAR PROBLEMS AND REPAIRS,2.1.9,"Oct 15, 2020",New features and performance improvements!
754,CAR PROBLEMS AND REPAIRS,2.1.7,"Oct 8, 2020",Performance improvements!


In [3]:
big_df.describe(include = 'all')

Unnamed: 0,App,Ver,Date,Notes
count,750,754,752,750
unique,30,640,612,389
top,Android Auto,2.1.1,"Nov 18, 2022",• Improved Do Not Disturb functionality.\n• Da...
freq,105,5,5,83


In [4]:
big_df.info()
#Drop the rows with null data

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 756 entries, 0 to 755
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   App     750 non-null    object
 1   Ver     754 non-null    object
 2   Date    752 non-null    object
 3   Notes   750 non-null    object
dtypes: object(4)
memory usage: 23.8+ KB


# PREPROCESSING

In [5]:
#Drop the rows with null data
big_df = big_df.dropna()
big_df.info()
big_df = big_df.reset_index(drop=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 741 entries, 0 to 755
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   App     741 non-null    object
 1   Ver     741 non-null    object
 2   Date    741 non-null    object
 3   Notes   741 non-null    object
dtypes: object(4)
memory usage: 28.9+ KB


In [6]:
UNIQUE_ROWS = big_df.shape[0]
# Convert to the traditional release version
big_df = big_df.astype({'Ver':'string'}) #Convert from float to string      
for i in range(UNIQUE_ROWS):
    dot_count = big_df["Ver"][i].count('.')
#     print(dot_count)
    if dot_count ==0:
        big_df["Ver"][i] = big_df["Ver"][i]+".0.0" #append .0.0
    elif dot_count ==1:
        big_df["Ver"][i] = big_df["Ver"][i]+".0" #append .0
    dot_count = big_df["Ver"][i].count('.')
    if dot_count <2:  #Validation to check no non-traditional string
        print(big_df["Ver"][i])
    
#Add a column to find difference in dates
big_df.insert(loc = 4, column = "Time of Rel", value = np.nan, allow_duplicates=False)
for i in range(UNIQUE_ROWS - 1):
#     datetime.strptime(big_df["App"][i])
    parse(big_df["Date"][i])
    if big_df["App"][i] == big_df["App"][i + 1]:
#         datetime.strptime(date_string)
        delta = parse(big_df["Date"][i]) - parse(big_df["Date"][i + 1])
        if delta.days < 0:
            print(delta.days, "is differenc of", parse(big_df["Date"][i]), "and ", parse(big_df["Date"][i + 1]))
        else:
            big_df["Time of Rel"][i] = delta.days
big_df                

#Categorize them as major, minor, patch releases
big_df.insert(loc = 5, column = "Rel type", value = np.nan, allow_duplicates=False)
for i in range(UNIQUE_ROWS - 1):
    if big_df["App"][i] == big_df["App"][i + 1]:
        my_ver = big_df["Ver"][i]
        my_prev_ver = big_df["Ver"][i + 1]
#         if diff b/w number before 1st . is > 0:
        if int(my_ver.split(".")[0]) > int(my_prev_ver.split(".")[0]):
            big_df["Rel type"][i] = "Major"
#         elif diff b/w number after 1st before 2nd . is > 0:
        elif int(my_ver.split(".")[1]) > int(my_prev_ver.split(".")[1]):
            big_df["Rel type"][i] = "Minor"
        else:
            big_df["Rel type"][i] = "Patch"
big_df 
big_df.info()

#Remove the first date in every app as we dont know the difference to its prev release.
# big_df = big_df.dropna()
# big_df.info()
# big_df = big_df.reset_index(drop=True)
# UNIQUE_ROWS = big_df.shape[0]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  big_df["Time of Rel"][i] = delta.days


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 741 entries, 0 to 740
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   App          741 non-null    object 
 1   Ver          741 non-null    string 
 2   Date         741 non-null    object 
 3   Notes        741 non-null    object 
 4   Time of Rel  711 non-null    float64
 5   Rel type     711 non-null    object 
dtypes: float64(1), object(4), string(1)
memory usage: 34.9+ KB


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  big_df["Rel type"][i] = "Major"


! pip uninstall stanfordcorenlp
! Conda uninstall stanfordcorenlp

In [3]:
# Stanford CoreNLP Lemmatization --Function definition
from stanfordcorenlp import StanfordCoreNLP
import json, string

def lemmatize_corenlp(conn_nlp, sentence):
    props = {
        'annotators': 'pos,lemma',
        'pipelineLanguage': 'en',
        'outputFormat': 'json'
    }

    # tokenize into words
    sents = conn_nlp.word_tokenize(sentence)

    # remove punctuations from tokenised list
    sents_no_punct = [s for s in sents if s not in string.punctuation]

    # form sentence
    sentence2 = " ".join(sents_no_punct)

    # annotate to get lemma
    parsed_str = conn_nlp.annotate(sentence2, properties=props)
    parsed_dict = json.loads(parsed_str)

    # extract the lemma for each word
    lemma_list = [v for d in parsed_dict['sentences'][0]['tokens'] for k,v in d.items() if k == 'lemma']

    # form sentence and return it
    return " ".join(lemma_list)
# make the connection and call `lemmatize_corenlp`
nlp = StanfordCoreNLP('http://localhost', port=9000, timeout=30000)

ModuleNotFoundError: No module named 'stanfordcorenlp'

In [28]:
#Find length, updatability, updatability for major releases
big_df.insert(loc = 6, column = "Length", value = np.nan, allow_duplicates=False)
for i in range(UNIQUE_ROWS):
    unique_words = set(big_df["Notes"][i].split())
    big_df["Length"][i] = len(unique_words)
big_df

#Remove punctuation. Stanford CoreNLP Lemmatization. Then cosine similarity b/w Ri and all prev rel. Updatability = 1 - cos simi
big_df.insert(loc = 7, column = "Processed Notes", value = np.nan, allow_duplicates=False)
for i in range(UNIQUE_ROWS):    
    big_df["Processed Notes"][i] = big_df["Notes"][i].translate(str.maketrans('', '', string.punctuation))
    big_df["Processed Notes"][i] = lemmatize_corenlp(conn_nlp=nlp, sentence=big_df["Processed Notes"][i])
big_df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  big_df["Length"][i] = len(unique_words)


Unnamed: 0,App,Ver,Date,Notes,Time of Rel,Rel type,Length,Processed Notes
0,Simply Auto: Car Maintenance &,50.1.1.1,"Mar 24, 2022",Bug fix for overlapping text,14.0,Major,5.0,Bug fix for overlapping text
1,Simply Auto: Car Maintenance &,49.1.1.1,"Mar 10, 2022",-Updated save icon-Increased font size on the ...,21.0,Major,11.0,Updated save iconIncreased font size on the lo...
2,Simply Auto: Car Maintenance &,48.1.1.1,"Feb 17, 2022",Bugfix for Total Cost calculation,10.0,Major,5.0,Bugfix for Total Cost calculation
3,Simply Auto: Car Maintenance &,47.1.1.3,"Feb 7, 2022","-Easy access to delete Vehicles, Fill-ups, ser...",16.0,Major,11.0,Easy access to delete Vehicles Fillups service...
4,Simply Auto: Car Maintenance &,46.1.2.1,"Jan 22, 2022",- Fixed crashes- Fixed service type delete bug,10.0,Major,7.0,Fixed crashes Fixed service type delete bug
...,...,...,...,...,...,...,...,...
736,CAR PROBLEMS AND REPAIRS,2.2.1,"Nov 2, 2020",Level improvements!,9.0,Patch,2.0,Level improvements
737,CAR PROBLEMS AND REPAIRS,2.2.0,"Oct 24, 2020",New mechanic and reward system come with this ...,9.0,Minor,9.0,New mechanic and reward system come with this ...
738,CAR PROBLEMS AND REPAIRS,2.1.9,"Oct 15, 2020",New features and performance improvements!,7.0,Patch,5.0,New features and performance improvements
739,CAR PROBLEMS AND REPAIRS,2.1.7,"Oct 8, 2020",Performance improvements!,12.0,Patch,2.0,Performance improvements
