In [1]:
from pyspark.sql import SparkSession

In [2]:
# One version string shared by the three Hadoop artifacts requested below.
hadoop_aws_ver = '3.3.2'

# Build the session, or reuse one that already exists in this kernel.
spark = (
    SparkSession.builder
    .appName("apollo-simge")
    # Extra jars resolved at start-up: hadoop-aws, hadoop-common, hadoop-client.
    .config(
        "spark.jars.packages",
        f"org.apache.hadoop:hadoop-aws:{hadoop_aws_ver},org.apache.hadoop:hadoop-common:{hadoop_aws_ver},org.apache.hadoop:hadoop-client:{hadoop_aws_ver}",
    )
    # s3a credentials come from the AWS default provider chain, not from this notebook.
    .config(
        'spark.hadoop.fs.s3a.aws.credentials.provider',
        'com.amazonaws.auth.DefaultAWSCredentialsProviderChain',
    )
    # The same JVM flag is passed to both the driver and the executors.
    .config(
        'spark.driver.extraJavaOptions',
        '-Dio.netty.tryReflectionSetAccessible=true',
    )
    .config(
        'spark.executor.extraJavaOptions',
        '-Dio.netty.tryReflectionSetAccessible=true',
    )
    .config('spark.driver.memory', '8G')
    .getOrCreate()
)



:: loading settings :: url = jar:file:/usr/local/lib/python3.9/site-packages/pyspark/jars/ivy-2.5.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /Users/simgeaciliogluhosgor/.ivy2/cache
The jars for the packages stored in: /Users/simgeaciliogluhosgor/.ivy2/jars
org.apache.hadoop#hadoop-aws added as a dependency
org.apache.hadoop#hadoop-common added as a dependency
org.apache.hadoop#hadoop-client added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-c2cdc61c-170c-4ae0-aa57-975c21d378f6;1.0
	confs: [default]
	found org.apache.hadoop#hadoop-aws;3.3.2 in central
	found com.amazonaws#aws-java-sdk-bundle;1.11.1026 in central
	found org.wildfly.openssl#wildfly-openssl;1.0.7.Final in central
	found org.apache.hadoop#hadoop-common;3.3.2 in central
	found org.apache.hadoop.thirdparty#hadoop-shaded-protobuf_3_7;1.1.1 in central
	found org.apache.hadoop#hadoop-annotations;3.3.2 in central
	found org.apache.hadoop.thirdparty#hadoop-shaded-guava;1.1.1 in central
	found com.google.guava#guava;27.0-jre in central
	found com.google.guava#failureaccess;1.0 in central
	found com.google.gu

	0 artifacts copied, 116 already retrieved (0kB/26ms)
22/03/28 01:35:14 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [3]:
# Load the source dataset from Parquet. The path literal is empty here, so a
# real file path must be filled in before this cell can run.
# Later cells read the 'language', 'content' and 'fileName' fields of each row.
df = spark.read.parquet("") #filepath

                                                                                

In [25]:
# RDD view of the DataFrame; getAnalytics() reads this module-level name directly.
rdd = df.rdd

def mapper(func):
    """Build the row -> (language, (metric, 1, fileName)) function used by getAnalytics.

    `func` is called with the row and supplies the metric value. The language
    is lower-cased and 'h' is folded into 'c'. The constant 1 is a per-file
    counter that is summed later.
    """
    def normalise(lang):
        # Rows tagged 'h' are counted under 'c'.
        if lang == 'h':
            return 'c'
        return lang

    def to_pair(row):
        key = normalise(row['language'].lower())
        return (key, (func(row), 1, row['fileName']))

    return to_pair

def discardFilename(row):
    """Keep only the first two payload fields: (key, (metric, count))."""
    key = row[0]
    payload = row[1]
    return (key, payload[:2])

def discardFileCount(row):
    """Drop the middle payload field: (key, (metric, fileName))."""
    key = row[0]
    payload = row[1]
    metric = payload[0]
    file_name = payload[2]
    return (key, (metric, file_name))

def reducePair(x, y):
    """Add two pairs element-wise, using only the first two items of each."""
    first = x[0] + y[0]
    second = x[1] + y[1]
    return (first, second)

def getAnalytics(specific_mapper_func):
    """Compute per-language statistics of one per-file metric.

    Reads the module-level `rdd`. `specific_mapper_func` receives a row and
    returns the numeric metric for that file (e.g. a word or line count).

    Returns a tuple `(result, mappedRdd)`.

    `result` is a dict of plain Python dicts keyed by language:
      - 'fileCount':      (sum of the metric, number of files)
      - 'averagePerFile': sum / number of files
      - 'maxFile':        (metric, fileName) of the file with the largest metric
      - 'minFile':        (metric, fileName) of the file with the smallest metric
      - 'stddev':         population standard deviation of the metric

    `mappedRdd` is the cached RDD of (language, (metric, 1, fileName)) records.
    The caller may call `.unpersist()` on it when it is no longer needed.
    """
    from math import sqrt

    result = {}

    # Several independent Spark jobs below start from this RDD. Caching it
    # means the source is read and `specific_mapper_func` is evaluated once,
    # instead of once per job.
    mappedRdd = rdd.map(mapper(specific_mapper_func)).cache()

    fileCount = mappedRdd.map(discardFilename).reduceByKey(reducePair)
    result['fileCount'] = fileCount.collectAsMap()

    # The totals are already on the driver, so the mean is a local division
    # rather than another Spark job. Key order follows 'fileCount'.
    result['averagePerFile'] = {
        lang: total / count
        for lang, (total, count) in result['fileCount'].items()
    }

    # Records here are (metric, fileName); compare on the metric only.
    # On a tie getMax keeps the right-hand record and getMin the left-hand one.
    getMax = lambda x, y: x if x[0] > y[0] else y
    getMin = lambda x, y: y if x[0] > y[0] else x

    valueAndName = mappedRdd.map(discardFileCount)

    maxFile = valueAndName.reduceByKey(getMax)
    result['maxFile'] = maxFile.collectAsMap()

    minFile = valueAndName.reduceByKey(getMin)
    result['minFile'] = minFile.collectAsMap()

    # Capture only the two small lookup tables the executors need, not the
    # whole `result` dict (which also holds the max/min file maps).
    means = result['averagePerFile']
    counts = {lang: count for lang, (_, count) in result['fileCount'].items()}

    def valSubMeanSqr(x):
        # One file's contribution to the variance: (value - mean)^2 / N.
        lang = x[0]
        value = x[1][0]
        return lang, ((value - means[lang]) ** 2) / counts[lang]

    stddev = (
        mappedRdd
        .map(valSubMeanSqr)
        .reduceByKey(lambda x, y: x + y)
        .map(lambda x: (x[0], sqrt(x[1])))
    )
    result['stddev'] = stddev.collectAsMap()

    return result, mappedRdd

In [26]:
def analyticsToDF(result_dict):
    """Lay out the dict-of-dicts from getAnalytics as a pandas DataFrame.

    `result_dict` maps a statistic name to a `{key: value}` dict. The returned
    frame has one column per statistic and one row per key; each cell is the
    tuple `(key, value)`.

    Rows are aligned on the key rather than on each dict's iteration order,
    so a row never mixes values belonging to different keys. Row order is the
    key order of the first statistic, followed by any keys that appear only in
    later statistics. A key missing from a statistic yields `(key, None)`.
    """
    import pandas as pd

    # Ordered, de-duplicated union of all keys (dict preserves insertion order).
    row_keys = list(dict.fromkeys(
        key for stat in result_dict.values() for key in stat
    ))

    return pd.DataFrame({
        name: [(key, stat.get(key)) for key in row_keys]
        for name, stat in result_dict.items()
    })

In [27]:
# Metric: number of whitespace-separated tokens in 'content' that contain at
# least one alphanumeric character.
wordCountAnalytics, _ = getAnalytics(
    lambda row: sum(
        1
        for token in row['content'].split()
        if any(ch.isalnum() for ch in token)
    )
)

                                                                                

In [28]:
# Display the word-count statistics as a table (one column per statistic).
analyticsToDF(wordCountAnalytics)

Unnamed: 0,fileCount,averagePerFile,maxFile,minFile,stddev
0,"(go, (120720809, 145481))","(go, 829.8046411558897)","(go, (149981, mattermost/mattermost-server/ven...","(go, (4, jesseduffield/lazygit/vendor/github.c...","(go, 3077.4473765876473)"
1,"(java, (99082288, 285483))","(java, 347.06896032338176)","(java, (92630, ReactiveX/RxJava/src/main/java/...","(java, (1, JetBrains/intellij-community/java/j...","(java, 783.843042363129)"
2,"(json, (55995767, 123717))","(json, 452.61174292942763)","(json, (92513, plotly/plotly.js/test/image/moc...","(json, (0, envoyproxy/envoy/test/common/json/j...","(json, 2295.8650597705678)"
3,"(c++, (80753408, 102855))","(c++, 785.1189344222449)","(c++, (76290, MarlinFirmware/Marlin/Marlin/src...","(c++, (1, CRYTEK/CRYENGINE/Code/CryEngine/CryA...","(c++, 1887.681692027397)"
4,"(c, (176239271, 186286))","(c, 946.0682552634122)","(c, (158539, arangodb/arangodb/arangod/IResear...","(c, (0, coolsnowwolf/lede/package/lean/mt/driv...","(c, 2644.7094094118743)"
5,"(javascript, (116910771, 278657))","(javascript, 419.5508133655354)","(javascript, (160155, octobercms/october/modul...","(javascript, (0, webpack/webpack/test/statsCas...","(javascript, 2303.2706308001707)"
6,"(markdown, (64364783, 108189))","(markdown, 594.9290870606069)","(markdown, (108190, jgm/pandoc/changelog.md))","(markdown, (0, prettier/prettier/tests/markdow...","(markdown, 1916.1295626313513)"
7,"(python, (54311104, 89750))","(python, 605.1376490250697)","(python, (59905, python/cpython/Lib/pydoc_data...","(python, (0, pytorch/pytorch/caffe2/python/bui...","(python, 1369.0119376268153)"
8,"(yaml, (8403805, 28588))","(yaml, 293.9626766475444)","(yaml, (74235, kubernetes/kops/addons/promethe...","(yaml, (1, JetBrains/intellij-community/plugin...","(yaml, 1806.4060698982705)"
9,"(rust, (7384468, 22194))","(rust, 332.7236189961251)","(rust, (65560, servo/servo/components/script/u...","(rust, (1, rust-lang/rust/src/test/rustdoc/iss...","(rust, 1028.0239093870887)"


In [29]:
# Metric: number of non-empty pieces of 'content' after splitting on '\n'.
lineCountAnalytics, _ = getAnalytics(
    lambda row: sum(1 for text in row['content'].split('\n') if text)
)

                                                                                

In [30]:
# Display the line-count statistics as a table (one column per statistic).
analyticsToDF(lineCountAnalytics)

Unnamed: 0,fileCount,averagePerFile,maxFile,minFile,stddev
0,"(go, (36546270, 145481))","(go, 251.20991744626446)","(go, (122875, mattermost/mattermost-server/ven...","(go, (2, golang/go/test/dwarf/dwarf.dir/z10.go))","(go, 1105.8536246199292)"
1,"(java, (32762612, 285483))","(java, 114.7620418728961)","(java, (25011, apache/hadoop/hadoop-yarn-proje...","(java, (1, JetBrains/intellij-community/platfo...","(java, 276.8333156296245)"
2,"(json, (32540768, 123717))","(json, 263.0258412344302)","(json, (58290, pingcap/tidb/planner/core/testd...","(json, (1, apache/spark/python/test_support/sq...","(json, 1434.0850371159393)"
3,"(c++, (28517563, 102855))","(c++, 277.25986096932576)","(c++, (23485, arangodb/arangodb/3rdParty/V8/v7...","(c++, (1, CRYTEK/CRYENGINE/Code/CryEngine/CryA...","(c++, 641.0618527311872)"
4,"(c, (53561635, 186286))","(c, 287.52367327657475)","(c, (41188, ruby/ruby/enc/unicode/12.1.0/name2...","(c, (1, arangodb/arangodb/3rdParty/boost/1.71....","(c, 735.8978195100211)"
5,"(javascript, (40262479, 278657))","(javascript, 144.4875922729377)","(javascript, (59175, keystonejs/keystone-class...","(javascript, (1, prettier/prettier/tests/js/cu...","(javascript, 756.8072002834125)"
6,"(markdown, (11268062, 108189))","(markdown, 104.15164203384818)","(markdown, (16849, apachecn/AiLearning/docs/tf...","(markdown, (1, arangodb/arangodb/Documentation...","(markdown, 290.9212870230607)"
7,"(python, (16713619, 89750))","(python, 186.22416713091923)","(python, (22430, psf/black/profiling/list_huge...","(python, (0, pytorch/pytorch/caffe2/python/bui...","(python, 413.34188371789793)"
8,"(yaml, (2782421, 28588))","(yaml, 97.32828459493494)","(yaml, (23645, Kong/kong/spec/fixtures/burst.y...","(yaml, (1, microsoft/winget-cli/src/AppInstall...","(yaml, 550.3695080183901)"
9,"(rust, (2661591, 22194))","(rust, 119.92389835090565)","(rust, (10414, rust-lang/rust/src/test/ui/issu...","(rust, (1, rust-lang/rust/src/test/ui/generics...","(rust, 329.913186555851)"
