<a href="https://colab.research.google.com/github/LorisBartesaghi/Kernel-Perceptron/blob/main/Frequent_itemsets.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import files

# Upload the kaggle.json API token. The result is bound to a name on purpose:
# a bare `files.upload()` as the last expression of the cell would echo the
# returned mapping (file name -> file content, i.e. the API key) into the
# notebook output, where it would be saved and shared with the notebook.
uploaded = files.upload()

In [None]:
#import dataset
!pip install -q kaggle
!mkdir ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json
!kaggle datasets download -d alvations/old-newspapers
!mkdir assignment
!unzip old-newspapers.zip -d assignment

In [None]:
#install pyspak
!pip install pyspark

In [None]:
#initiate a spark session
from pyspark.sql import SparkSession

# Local Spark session using every available core.
# Arrow is enabled for the Spark <-> pandas conversions (e.g. toPandas), with
# fallback to the non-Arrow path when a conversion is not supported.
# The `arrow.pyspark.*` keys are the current names of these options; the
# un-prefixed `spark.sql.execution.arrow.enabled` spelling is deprecated.
spark = (
    SparkSession.builder
    .master("local[*]")
    .config("spark.sql.execution.arrow.pyspark.enabled", "true")
    .config("spark.sql.execution.arrow.pyspark.fallback.enabled", "true")
    .getOrCreate()
)

In [None]:
# Path of the dataset TSV file (the Kaggle archive is unzipped into the `assignment` folder)
newspapers = '/content/assignment/old-newspaper.tsv'

**DATA STRUCTURE UNDERSTANDING**

In [None]:
# Load the tab-separated file as a DataFrame (first row is the header)
# and display the inferred schema
df = (
    spark.read
    .options(header=True, sep=r'\t')
    .csv(newspapers)
)
df.schema

In [None]:
# Keep the 'Text' column of the English rows and preview the first five entries
df2 = df.filter(df['Language'] == 'English').select('Text')
df2.limit(5).toPandas()


**DATA PREPROCESSING**

In [None]:
# Read the raw TSV file as an RDD of text lines (the header line is included)
dataRDD = spark.sparkContext.textFile(newspapers)

In [None]:
# Extract the text field of the English rows from the raw TSV lines
def extract_text(file):
  """Return the text of a raw TSV line when the line is marked as English.

  The line is split on tabs; the first field is read as the language and the
  fourth field as the text.

  Args:
    file: one raw line of the TSV file, as a string.

  Returns:
    The fourth tab-separated field when the first field is 'English' and the
    line has at least four fields; otherwise the placeholder string
    'Pay attention, not English', which callers filter out afterwards.
  """
  fields = file.split('\t')
  # A truncated or malformed row has no text field: treat it like a
  # non-English row instead of failing the whole job with an IndexError.
  if len(fields) > 3 and fields[0] == 'English':
    return fields[3]
  return 'Pay attention, not English'


In [None]:
# Map every raw line to its text field, or to the 'Pay attention, not English'
# placeholder when the line is not an English row
TextRDD = dataRDD.map(extract_text)

In [None]:
# Drop the placeholder entries so that only the English texts remain
English_TextRDD = TextRDD.filter(lambda line: line != "Pay attention, not English")

In [None]:
# Lower-case every text, then remove the 'p.m.' and 'a.m.' markers (in that order)
English_TextRDD = English_TextRDD.map(
    lambda text: text.lower().replace('p.m.', '').replace('a.m.', '')
)

In [None]:
def split(line):
    """Lower-case a line and delete punctuation characters and digits.

    Despite its name this function does not tokenize: it returns one cleaned
    string. Double quotes are stripped only at the two ends of the line.

    NOTE(review): double quotes inside the line and apostrophes are not in the
    deletion set, so they survive the clean-up -- confirm this is intended.

    Args:
        line: the text to clean.

    Returns:
        The lower-cased text without the characters listed in ``punc``.
    """
    # Characters to delete. A raw string keeps the backslash as a literal
    # character of the set instead of starting an (invalid) escape sequence.
    punc = r'!#$%&\+()*,-./:;<=>?@[]^_`{|}~0123456789'
    # One translate() pass deletes every listed character at once.
    return line.lower().strip('"').translate(str.maketrans('', '', punc))

In [None]:
# Apply the punctuation/digit clean-up function to every English text
Cleaned_RDD = English_TextRDD.map(split)

In [None]:
from pyspark.ml.feature import StopWordsRemover
# Stop-word list of a StopWordsRemover built with its default settings
remover = StopWordsRemover()
stopwords = remover.getStopWords()

def retain_unique(line, stop_words=None):
  """Return the distinct non-stop-words of a line, in order of first appearance.

  Args:
    line: a text line; it is tokenized on whitespace.
    stop_words: optional iterable of words to discard. When omitted, the
      module-level ``stopwords`` list is used.

  Returns:
    A list in which every remaining word of the line appears exactly once.
  """
  # Sets give constant-time membership tests; scanning the stop-word list and
  # the output list for every word made the work quadratic in the line length.
  excluded = frozenset(stopwords if stop_words is None else stop_words)
  seen = set()
  kept = []
  for word in line.split():
    if word in excluded or word in seen:
      continue
    seen.add(word)
    kept.append(word)
  return kept

In [None]:
# Turn every cleaned line into the list of its distinct non-stop-words (one basket per line)
Listed_RDD = Cleaned_RDD.map(retain_unique)


**DATA ANALYSIS**



In [None]:
# Flatten the per-line word lists into a single RDD of words; the support of each word is counted from it
items = Listed_RDD.flatMap(lambda line:line)

In [None]:
# Distinct items (words) present in the dataset
uni_item = items.distinct()

In [None]:
# Binary addition, passed to reduceByKey to add up the values of a key
def sumOparator(x,y):
    """Return the sum of ``x`` and ``y``."""
    total = x + y
    return total

In [None]:
# Pair every word with 1 and add the ones up per word to obtain its support
supportRdd = items.map(lambda word: (word, 1)).reduceByKey(sumOparator)

# Keep only the frequency of each word
supports = supportRdd.map(lambda pair: pair[1])


In [None]:
from pyspark.sql.types import StructType,StructField, StringType, IntegerType
# Schema of the (word, count) pairs: a string item and its integer count
cols = StructType([
    StructField('item', StringType(), nullable=True),
    StructField('counting', IntegerType(), nullable=True),
])
deptDF = spark.createDataFrame(supportRdd, schema=cols)

In [None]:
from pyspark.sql import functions as F

# Average, minimum and maximum of the per-word counts
deptDF.agg(
    F.avg('counting').alias('average frequence'),
    F.min('counting').alias('minimin frequence'),
    F.max('counting').alias('maximum frequence'),
).show()

In [None]:
# Number of distinct items (words)
deptDF.select('item').distinct().count()

In [None]:
# Sum of the `counting` column over all items
deptDF.agg(F.sum("counting")).collect()[0][0]

In [None]:
from matplotlib import pyplot 
# Ten items with the highest count, highest first
new = deptDF.sort(deptDF.counting.desc()).take(10)
for_barplot  = spark.createDataFrame(new,['word','counting'])

word_count = for_barplot.collect()
# one x position per word
indexes = list(range(len(word_count)))

# split words and counts into two parallel lists
values = [r['counting'] for r in word_count]
labels = [r['word'] for r in word_count]

# Plotting
fig1, ax = pyplot.subplots()
ax.bar(indexes, values)

# Bars are centred on their x position by default, so the tick labels are put
# at exactly the same positions (an offset would shift them off the bars).
ax.set_xticks(indexes)
ax.set_xticklabels(labels)
ax.set_xlabel('word')
ax.set_ylabel('count')
ax.set_title('Ten most frequent words')

# Save through the figure handle before showing it, so the saved file does not
# depend on what the backend does with the figure once it has been displayed.
fig1.savefig('/content/frequence1.png', bbox_inches = 'tight')
pyplot.show()


In [None]:
#files.download('/content/frequence1.png')

**BUILT-IN FUNCTIONS**

APRIORI ALGORITHM


In [None]:
from mlxtend.frequent_patterns import apriori
from mlxtend.preprocessing import TransactionEncoder
import pandas as pd


In [None]:
# Only the first 10,000 baskets are used by the Apriori step, so fetch just
# those instead of collecting the whole RDD into the driver's memory.
words_basket = Listed_RDD.take(10000)


In [None]:
# Baskets given to Apriori: at most the first 10,000 word lists
try_set = words_basket[0:10000]

In [None]:
# One-hot encode the baskets as a sparse matrix (one column per word)
te = TransactionEncoder()
te.fit(try_set)
Array = te.transform(try_set, sparse=True)

# Wrap the matrix in a sparse DataFrame labelled with the words, then mine the
# itemsets whose support is at least 0.5% of the baskets
sparse_df = pd.DataFrame.sparse.from_spmatrix(Array, columns=te.columns_)
res = apriori(sparse_df, min_support=0.005, use_colnames=True)

In [None]:
# Keep the itemsets made of at least two items and show the ten with the highest support
has_several_items = res['itemsets'].map(len) > 1
res_items = res[has_several_items]
res_items.sort_values(by='support', ascending=False).head(10)

FPGROWTH


In [None]:
from pyspark.ml.fpm import FPGrowth

In [None]:
# One row per basket with its word list in the 'words' column, followed by a
# 10% sample without replacement (fixed seed)
df = (
    Listed_RDD
    .map(lambda words: [words])
    .toDF(['words'])
    .sample(withReplacement=False, fraction=0.1, seed=0)
)

In [None]:
# Configure FP-Growth on the 'words' column and fit it on the sampled baskets
fpGrowth = FPGrowth(
    itemsCol='words',
    minSupport=0.005,
    minConfidence=0.001,
    numPartitions=1000,
)
model2 = fpGrowth.fit(df)

In [None]:
# Association rules produced by the fitted model, 20 rows ordered by decreasing support
find_model = model2.associationRules
find_model.orderBy(find_model['support'].desc()).show(20)