In [63]:
import sys
!{sys.executable} -m pip install pyspark



In [64]:

from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
from time import sleep
from datetime import datetime, timedelta

spark = SparkSession \
    .builder \
    .appName("Github commits message dataset ESGI Project - Group 6") \
    .master("local[*]") \
    .getOrCreate()
    # .master("spark://spark-master:7077") \
    # .config("spark.executor.memory", "4g") \
    # .config("spark.executor.cores", 6) \



In [65]:

full_file = "./data/full.csv"

#/ commit,author,date,message,repo
schema = StructType([
    StructField("commit", StringType(), True),
    StructField("author", StringType(), True),
    StructField("date", StringType(), True),
    StructField("message", StringType(), True),
    StructField("repo", StringType(), True)
])

# 1
df = spark.read \
    .option("header", "true") \
    .option("multiLine", "true") \
    .schema(schema) \
    .load(full_file, format="csv")


In [66]:

# 1. Afficher dans la console les 10 projets Github pour lesquels il y a eu le plus de
# commit.
tenProjectWithMostCommits = df.groupBy("repo") \
    .agg(count("commit").alias("count")) \
    .orderBy(desc("count"))
tenProjectWithMostCommits.show(n=10, truncate=False)

+---------------------+-------+
|repo                 |count  |
+---------------------+-------+
|null                 |5971457|
|chromium/chromium    |895846 |
|torvalds/linux       |816748 |
|llvm/llvm-project    |355433 |
|freebsd/freebsd-src  |242564 |
|openbsd/src          |203484 |
|gcc-mirror/gcc       |174277 |
|rust-lang/rust       |136070 |
|apple/swift          |113355 |
|tensorflow/tensorflow|105709 |
+---------------------+-------+
only showing top 10 rows



In [67]:
# 2. Afficher dans la console le plus gros contributeur (la personne qui a fait le plus de
# commit) du projet apache/spark.
biggestContributor = df.filter(df.repo == "apache/spark") \
    .groupBy("author") \
    .agg(count("commit").alias("count")) \
    .orderBy(desc("count"))
biggestContributor.show(n=1, truncate=False)

+---------------------------------------+-----+
|author                                 |count|
+---------------------------------------+-----+
|Matei Zaharia <matei@eecs.berkeley.edu>|1316 |
+---------------------------------------+-----+
only showing top 1 row



In [68]:

# 3. Afficher dans la console les plus gros contributeurs du projet apache/spark sur les 4
# dernières années. Pas de date en dur dans le code . Pour la conversion vous
# pouvez vous référer à cette documentation.
dateFourYearsAgo = datetime.now() - timedelta(days=4*365)
formatted_dt = dateFourYearsAgo.strftime("%a %b %d %H:%M:%S %Y %z")
print("Date four years ago: ", formatted_dt)


# date in csv looks like : Wed Apr 21 12:27:07 2021 +0800
biggestContributorLastFourYears = df.filter(df.repo == "apache/spark") \
    .filter(df.date >= formatted_dt) \
    .groupBy("author") \
    .agg(count("commit").alias("count")) \
    .orderBy(desc("count"))
biggestContributorLastFourYears.show(n=10, truncate=False)

Date four years ago:  Tue May 14 14:28:52 2019 
+-------------------------------------------+-----+
|author                                     |count|
+-------------------------------------------+-----+
|Matei Zaharia <matei@eecs.berkeley.edu>    |262  |
|Wenchen Fan <wenchen@databricks.com>       |181  |
|Reynold Xin <rxin@databricks.com>          |168  |
|Patrick Wendell <pwendell@gmail.com>       |142  |
|Tathagata Das <tathagata.das1565@gmail.com>|121  |
|Liang-Chi Hsieh <viirya@gmail.com>         |105  |
|Davies Liu <davies@databricks.com>         |93   |
|Xiangrui Meng <meng@databricks.com>        |90   |
|Marcelo Vanzin <vanzin@cloudera.com>       |80   |
|Yanbo Liang <ybliang8@gmail.com>           |78   |
+-------------------------------------------+-----+
only showing top 10 rows



In [70]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lower, regexp_replace, split, explode, length, array
from pyspark.ml.feature import StopWordsRemover
from pyspark.sql.functions import desc
import os

# Process commit messages: split into words, lower case, remove punctuation
words = df.select(explode(split(lower(regexp_replace(col('message'), '[^\w\s]', ' ')), ' ')).alias('word'))

# Remove empty strings and non-alphabetic "words"
words = words.filter((length(col('word')) > 0) & (col('word').rlike('[a-z]'))).withColumn('word', trim(col('word')))

# Convert each row into a list of words
words = words.withColumn('word', array('word'))
print("Filtered words with punctuation removed and empty strings removed")
print(words)

# Directory containing the stopword text files
stopwords_dir = './data/stop_words/'

# Read all stopword files and combine into a single list
stopwords_list = []
for filename in os.listdir(stopwords_dir):
    if filename.endswith('.txt'):
        with open(os.path.join(stopwords_dir, filename), 'r', encoding='ISO-8859-1') as f:
            stopwords_list += [word.strip() for word in f.readlines()]
print("Stop words list")
print(stopwords_list)

# Use the loaded stop words list
remover = StopWordsRemover(stopWords=stopwords_list)
remover.setInputCol('word')
remover.setOutputCol('filtered')

# Remove stop words
words_no_stop_words = remover.transform(words)

# Make column "filtered" as string
words_no_stop_words = words_no_stop_words.withColumn('filtered', explode('filtered'))

# Count word frequencies and show the top 10
words_no_stop_words.groupBy('filtered').count().orderBy(desc('count')).show(10)


Filtered words with punctuation removed and empty strings removed
DataFrame[word: array<string>]
Stop words list
['a', 'à', 'â', 'abord', 'afin', 'ah', 'ai', 'aie', 'ainsi', 'allaient', 'allo', 'allô', 'allons', 'après', 'assez', 'attendu', 'au', 'aucun', 'aucune', 'aujourd', "aujourd'hui", 'auquel', 'aura', 'auront', 'aussi', 'autre', 'autres', 'aux', 'auxquelles', 'auxquels', 'avaient', 'avais', 'avait', 'avant', 'avec', 'avoir', 'ayant', 'b', 'bah', 'beaucoup', 'bien', 'bigre', 'boum', 'bravo', 'brrr', 'c', 'ça', 'car', 'ce', 'ceci', 'cela', 'celle', 'celle-ci', 'celle-là', 'celles', 'celles-ci', 'celles-là', 'celui', 'celui-ci', 'celui-là', 'cent', 'cependant', 'certain', 'certaine', 'certaines', 'certains', 'certes', 'ces', 'cet', 'cette', 'ceux', 'ceux-ci', 'ceux-là', 'chacun', 'chaque', 'cher', 'chère', 'chères', 'chers', 'chez', 'chiche', 'chut', 'ci', 'cinq', 'cinquantaine', 'cinquante', 'cinquantième', 'cinquième', 'clac', 'clic', 'combien', 'comme', 'comment', 'compris', 'co