In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
from dateutil.parser import parse
import string #To remove punctuation. NLP

metadata_file = "output/App_metadata.csv"
rel_notes_folder = "Automobile_extracts/"
NUM_SAMPLE_APPS = 30

# READ DATA

In [2]:
##Read metadata for automobile apps collected ( 30 apps from each category). 
df_metadata = pd.read_csv(metadata_file)
df_metadata
df_metadata["category"].unique() #Check categories, we are going to experiment on automotive
# array(['automotive', 'books', 'business', 'education', 'entertainment',
#        'family and relationship', 'finance', 'food', 'graphic and design',
#        'game', 'health and fitness', 'lifestyle', 'news and magazines',
#        'photo and video', 'real estate', 'shopping', 'social media',
#        'sports', 'travel', 'utility and productivity', 'games'],
#       dtype=object)

auto_meta_df = df_metadata[df_metadata["category"]== "automotive"]
auto_meta_df
app_names = auto_meta_df['title'].values

##Read the release notes of the apps

# Import libraries
import glob

# Get CSV files list from a folder
csv_files = glob.glob(rel_notes_folder + "/*.csv")

# Read each CSV file into DataFrame
# This creates a list of dataframes
df_list = (pd.read_csv(file) for file in csv_files)

#Sort the dataframe according to date decreasing order
# for i in df_list:
#     i = i.sort_values(by=['Date'], ascending=False)

big_df = pd.DataFrame() # Create empty dataframe
for i in df_list:
#     #Sort the dataframe according to date decreasing order
#     for j in range(i.shape[0]):
#         i["Date"][j] = parse(i["Date"][j])
#     i = i.sort_values(by=['Date'], ascending=False)
    # Concatenate all DataFrames
    big_df = pd.concat([big_df,i], ignore_index=True)
big_df




Unnamed: 0,App,Ver,Date,Notes
0,Simply Auto: Car Maintenance &,50.1.1.1,"Mar 24, 2022",Bug fix for overlapping text
1,Simply Auto: Car Maintenance &,49.1.1.1,"Mar 10, 2022",-Updated save icon-Increased font size on the ...
2,Simply Auto: Car Maintenance &,48.1.1.1,"Feb 17, 2022",Bugfix for Total Cost calculation
3,Simply Auto: Car Maintenance &,47.1.1.3,"Feb 7, 2022","-Easy access to delete Vehicles, Fill-ups, ser..."
4,Simply Auto: Car Maintenance &,46.1.2.1,"Jan 22, 2022",- Fixed crashes- Fixed service type delete bug
...,...,...,...,...
751,CAR PROBLEMS AND REPAIRS,2.2.1,"Nov 2, 2020",Level improvements!
752,CAR PROBLEMS AND REPAIRS,2.2.0,"Oct 24, 2020",New mechanic and reward system come with this ...
753,CAR PROBLEMS AND REPAIRS,2.1.9,"Oct 15, 2020",New features and performance improvements!
754,CAR PROBLEMS AND REPAIRS,2.1.7,"Oct 8, 2020",Performance improvements!


In [3]:
big_df.describe(include = 'all')

Unnamed: 0,App,Ver,Date,Notes
count,750,754,752,750
unique,30,640,612,389
top,Android Auto,2.1.1,"Nov 18, 2022",• Improved Do Not Disturb functionality.\n• Da...
freq,105,5,5,83


In [4]:
big_df.info()
#Drop the rows with null data

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 756 entries, 0 to 755
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   App     750 non-null    object
 1   Ver     754 non-null    object
 2   Date    752 non-null    object
 3   Notes   750 non-null    object
dtypes: object(4)
memory usage: 23.8+ KB


# PREPROCESSING

In [5]:
#Drop the rows with null data
big_df = big_df.dropna()
big_df.info()
big_df = big_df.reset_index(drop=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 741 entries, 0 to 755
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   App     741 non-null    object
 1   Ver     741 non-null    object
 2   Date    741 non-null    object
 3   Notes   741 non-null    object
dtypes: object(4)
memory usage: 28.9+ KB


In [6]:
UNIQUE_ROWS = big_df.shape[0]
# Convert to the traditional release version
big_df = big_df.astype({'Ver':'string'}) #Convert from float to string      
for i in range(UNIQUE_ROWS):
    dot_count = big_df["Ver"][i].count('.')
#     print(dot_count)
    if dot_count ==0:
        big_df["Ver"][i] = big_df["Ver"][i]+".0.0" #append .0.0
    elif dot_count ==1:
        big_df["Ver"][i] = big_df["Ver"][i]+".0" #append .0
    dot_count = big_df["Ver"][i].count('.')
    if dot_count <2:  #Validation to check no non-traditional string
        print(big_df["Ver"][i])
    
#Add a column to find difference in dates
big_df.insert(loc = 4, column = "Time of Rel", value = np.nan, allow_duplicates=False)
for i in range(UNIQUE_ROWS - 1):
#     datetime.strptime(big_df["App"][i])
    parse(big_df["Date"][i])
    if big_df["App"][i] == big_df["App"][i + 1]:
#         datetime.strptime(date_string)
        delta = parse(big_df["Date"][i]) - parse(big_df["Date"][i + 1])
        if delta.days < 0:
            print(delta.days, "is differenc of", parse(big_df["Date"][i]), "and ", parse(big_df["Date"][i + 1]))
        else:
            big_df["Time of Rel"][i] = delta.days
big_df                

#Categorize them as major, minor, patch releases
big_df.insert(loc = 5, column = "Rel type", value = np.nan, allow_duplicates=False)
for i in range(UNIQUE_ROWS - 1):
    if big_df["App"][i] == big_df["App"][i + 1]:
        my_ver = big_df["Ver"][i]
        my_prev_ver = big_df["Ver"][i + 1]
#         if diff b/w number before 1st . is > 0:
        if int(my_ver.split(".")[0]) > int(my_prev_ver.split(".")[0]):
            big_df["Rel type"][i] = "Major"
#         elif diff b/w number after 1st before 2nd . is > 0:
        elif int(my_ver.split(".")[1]) > int(my_prev_ver.split(".")[1]):
            big_df["Rel type"][i] = "Minor"
        else:
            big_df["Rel type"][i] = "Patch"
    else: #it will bethe oldest rel note collected, so make it major. 
        big_df["Rel type"][i] = "Major"
big_df["Rel type"][UNIQUE_ROWS - 1] = "Major"
big_df 
big_df.info()

#Remove the first date in every app as we dont know the difference to its prev release.
# big_df = big_df.dropna()
# big_df.info()
# big_df = big_df.reset_index(drop=True)
# UNIQUE_ROWS = big_df.shape[0]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  big_df["Time of Rel"][i] = delta.days
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  big_df["Rel type"][i] = "Major"


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 741 entries, 0 to 740
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   App          741 non-null    object 
 1   Ver          741 non-null    string 
 2   Date         741 non-null    object 
 3   Notes        741 non-null    object 
 4   Time of Rel  711 non-null    float64
 5   Rel type     741 non-null    object 
dtypes: float64(1), object(4), string(1)
memory usage: 34.9+ KB


# Deriving Features

! pip uninstall stanfordcorenlp
! Conda uninstall stanfordcorenlp

In [7]:
# Stanford CoreNLP Lemmatization --Function definition'

# jahnavik$ cd Downloads/
# (base) Jahnavis-MacBook-Air:Downloads 
# jahnavik$ ls
# 1.csv					Rectangle0.66.dmg
# APPS_removed_duplicates - Apps.csv	Visual Studio Code-2.app
# Octoparse-8.5.8.dmg			Visual Studio Code.app
# R-4.2.2-arm64.pkg			cli.html
# RStudio-2022.12.0-353.dmg		stanford-corenlp-4.5.2
# (base) Jahnavis-MacBook-Air:Downloads 
# jahnavik$ cd stanford-corenlp-4.5.2/
# (base) Jahnavis-MacBook-Air:stanford-corenlp-4.5.2 
# jahnavik$ java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -annotators "tokenize,ssplit,pos,lemma,parse,sentiment" -port 9000 -timeout 30000
from stanfordcorenlp import StanfordCoreNLP
import json, string

def lemmatize_corenlp(conn_nlp, sentence):
    props = {
        'annotators': 'pos,lemma',
        'pipelineLanguage': 'en',
        'outputFormat': 'json'
    }

    # tokenize into words
    sents = conn_nlp.word_tokenize(sentence)

    # remove punctuations from tokenised list
    sents_no_punct = [s for s in sents if s not in string.punctuation]

    # form sentence
    sentence2 = " ".join(sents_no_punct)

    # annotate to get lemma
    parsed_str = conn_nlp.annotate(sentence2, properties=props)
    parsed_dict = json.loads(parsed_str)

    # extract the lemma for each word
    lemma_list = [v for d in parsed_dict['sentences'][0]['tokens'] for k,v in d.items() if k == 'lemma']

    # form sentence and return it
    return " ".join(lemma_list)
# make the connection and call `lemmatize_corenlp`
nlp = StanfordCoreNLP('http://localhost', port=9000, timeout=30000)

In [8]:
#reference: https://studymachinelearning.com/cosine-similarity-text-similarity-metric/
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

def create_dataframe(matrix, tokens):

    doc_names = [f'doc_{i+1}' for i, _ in enumerate(matrix)]
    df = pd.DataFrame(data=matrix, index=doc_names, columns=tokens)
    return(df)

def cos_similarity(data_1, data_2):
    data = [data_1, data_2]
    Tfidf_vect = TfidfVectorizer()
    vector_matrix = Tfidf_vect.fit_transform(data)
    tokens = Tfidf_vect.get_feature_names_out()
    create_dataframe(vector_matrix.toarray(),tokens)
    cosine_similarity_matrix = cosine_similarity(vector_matrix)
    df = create_dataframe(cosine_similarity_matrix,['data_1','data_2'])
    return df["data_1"][1]

In [9]:
#Find length, updatability, updatability for major releases
big_df.insert(loc = 6, column = "Length", value = np.nan, allow_duplicates=False)
for i in range(UNIQUE_ROWS):
    unique_words = set(big_df["Notes"][i].split())
    big_df["Length"][i] = len(unique_words)
big_df

#Remove punctuation. Stanford CoreNLP Lemmatization. Then cosine similarity b/w Ri and all prev rel. Updatability = 1 - cos simi
big_df.insert(loc = 7, column = "Processed Notes", value = np.nan, allow_duplicates=False)
for i in range(UNIQUE_ROWS):    
    big_df["Processed Notes"][i] = big_df["Notes"][i].translate(str.maketrans('', '', string.punctuation))
    big_df["Processed Notes"][i] = lemmatize_corenlp(conn_nlp=nlp, sentence=big_df["Processed Notes"][i])
big_df

big_df.insert(loc = 8, column = "Updatability", value = np.nan, allow_duplicates=False)
for i in range(UNIQUE_ROWS -1):
    if big_df["App"][i] == big_df["App"][i + 1]:
        data_1 = big_df["Processed Notes"][i]
        data_2 = big_df["Processed Notes"][i+1]
        big_df["Updatability"][i] = 1 - cos_similarity(data_1, data_2)
    else:
        big_df["Updatability"][i] = 1 #Oldest rel note we are assuming as unique
big_df["Updatability"][UNIQUE_ROWS-1] = 1 
big_df["Updatability"]

big_df.insert(loc = 9, column = "Updatability for Major Rel", value = np.nan, allow_duplicates=False)
for i in range(UNIQUE_ROWS -1):
    if big_df["App"][i] == big_df["App"][i + 1]:
        j = i + 1
        while(big_df["Rel type"][j] != "Major"):
            if ((j+1 in range(UNIQUE_ROWS)) and (big_df["App"][j] == big_df["App"][j + 1])):
                j = j + 1
            else:
                break
        if big_df["Rel type"][j] == "Major":
            data_1 = big_df["Processed Notes"][i]
            data_2 = big_df["Processed Notes"][j]
            big_df["Updatability for Major Rel"][i] = 1 - cos_similarity(data_1, data_2)
    else:
        big_df["Updatability for Major Rel"][i] = 1 #Oldest rel note we are assuming as unique
big_df["Updatability for Major Rel"][UNIQUE_ROWS-1] = 1 
big_df["Updatability for Major Rel"]
    

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  big_df["Length"][i] = len(unique_words)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  big_df["Processed Notes"][i] = big_df["Notes"][i].translate(str.maketrans('', '', string.punctuation))
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  big_df["Updatability"][i] = 1 - cos_similarity(data_1, data_2)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.

0      0.840261
1      1.000000
2      1.000000
3      0.732882
4      0.688743
         ...   
736    1.000000
737    1.000000
738    1.000000
739    1.000000
740    1.000000
Name: Updatability for Major Rel, Length: 741, dtype: float64

In [10]:
# big_df = big_df.drop(['Length'], axis=1)
# big_df = big_df.drop(['Updatability'], axis=1)
# big_df = big_df.drop(['Updatability for Major Rel'], axis=1)
big_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 741 entries, 0 to 740
Data columns (total 10 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   App                         741 non-null    object 
 1   Ver                         741 non-null    string 
 2   Date                        741 non-null    object 
 3   Notes                       741 non-null    object 
 4   Time of Rel                 711 non-null    float64
 5   Rel type                    741 non-null    object 
 6   Length                      741 non-null    float64
 7   Processed Notes             741 non-null    object 
 8   Updatability                741 non-null    float64
 9   Updatability for Major Rel  741 non-null    float64
dtypes: float64(4), object(5), string(1)
memory usage: 58.0+ KB


In [42]:
#Remove the first date in every app as we dont know the difference to its prev release.
big_df = big_df.dropna()
big_df.info()
big_df = big_df.reset_index(drop=True)
UNIQUE_ROWS = big_df.shape[0]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 711 entries, 0 to 710
Data columns (total 10 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   App                         711 non-null    object 
 1   Ver                         711 non-null    string 
 2   Date                        711 non-null    object 
 3   Notes                       711 non-null    object 
 4   Time of Rel                 711 non-null    float64
 5   Rel type                    711 non-null    object 
 6   Length                      711 non-null    float64
 7   Processed Notes             711 non-null    object 
 8   Updatability                711 non-null    float64
 9   Updatability for Major Rel  711 non-null    float64
dtypes: float64(4), object(5), string(1)
memory usage: 55.7+ KB


In [25]:
#Spearman Rank Correlation Coefficient between Length, Updatability, Updatability for Major Rel
from scipy.stats import spearmanr
# calculate spearman's correlation
def SpearmanCorr(series1,series2):
    coef, p = spearmanr(series1, series2)
    print('Spearmans correlation coefficient: %.3f' % coef)
    # interpret the significance
    alpha = 0.0000 # 0.05
    if p > alpha:
        print('Samples are uncorrelated (fail to reject H0) p=%.3f' % p)
    else:
        print('Samples are correlated (reject H0) p=%.3f' % p)

print("Length and Updatability:")
SpearmanCorr(big_df["Length"], big_df["Updatability"])

print("\nLength and Updatability for Major Rel:")
SpearmanCorr(big_df["Length"], big_df["Updatability for Major Rel"])

print("\nUpdatability and Updatability for Major Rel:")
SpearmanCorr(big_df["Updatability"], big_df["Updatability for Major Rel"])

Length and Updatability:
Spearmans correlation coefficient: -0.095
Samples are uncorrelated (fail to reject H0) p=0.011

Length and Updatability for Major Rel:
Spearmans correlation coefficient: -0.119
Samples are uncorrelated (fail to reject H0) p=0.001

Updatability and Updatability for Major Rel:
Spearmans correlation coefficient: 0.688
Samples are uncorrelated (fail to reject H0) p=0.000


# RQ1: Release Note Update Patterns

In [45]:
auto_meta_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 30 entries, 0 to 29
Data columns (total 13 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   category                    30 non-null     object 
 1   app_id                      30 non-null     object 
 2   title                       30 non-null     object 
 3   score                       30 non-null     float64
 4   genre                       30 non-null     object 
 5   price                       30 non-null     float64
 6   free                        30 non-null     bool   
 7   currency                    30 non-null     object 
 8   developer                   30 non-null     object 
 9   installs                    30 non-null     object 
 10  Length                      30 non-null     float64
 11  Updatability                30 non-null     float64
 12  Updatability for Major Rel  30 non-null     float64
dtypes: bool(1), float64(5), object(7)
mem

In [44]:
#Calculate Input Fetaures for (identifying Release note patterns using) K-means clustering.
import math
# Percentage of high value(above the third quartile)
# Ascending order. Third Quartile(Q3) = (3(n + 1)/4)th Term. Percentage of releases of an app with features above the third quantile

def percent_above_3rd_qartile(column_nm):
    auto_meta_df.insert(loc = auto_meta_df.shape[1], column = column_nm, value = np.nan, allow_duplicates=False)
    column_lst = big_df[column_nm].values.tolist() #Convert to list
    # print(type(column_lst))
    column_lst.sort() #Sort in ascending order
    # print(column_lst)
    n = len(column_lst) #Total values
    Q3 = math.floor((3*(n + 1)/4)) #Third quartile
    # for each app 
    for j in range(auto_meta_df.shape[0]):
        app = auto_meta_df["title"][j]
        total = 0 
        cent = 0 #Init % of values higher than Q3
        for i in range(UNIQUE_ROWS -1):
            if big_df["App"][i] == app:
                total = total + 1
                if big_df[column_nm][i] >= cent:
                    cent = cent + 1
        auto_meta_df[column_nm][j] = ((cent / total )*100) #% of values higher than Q3
percent_above_3rd_qartile("Length")
percent_above_3rd_qartile("Updatability")
percent_above_3rd_qartile("Updatability for Major Rel")
auto_meta_df["Length"]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  auto_meta_df[column_nm][j] = ((cent / total )*100) #% of values higher than Q3
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  auto_meta_df[column_nm][j] = ((cent / total )*100) #% of values higher than Q3


0      36.000000
1      70.833333
2      95.833333
3     100.000000
4      79.166667
5      45.833333
6      66.666667
7      43.478261
8      87.500000
9      37.500000
10    100.000000
11     58.333333
12     79.166667
13     80.000000
14     87.500000
15    100.000000
16     54.166667
17    100.000000
18     72.727273
19     91.666667
20     62.500000
21     20.833333
22    100.000000
23     70.833333
24    100.000000
25    100.000000
26     95.833333
27     79.166667
28    100.000000
29     95.833333
Name: Length, dtype: float64