In [55]:
import unicodedata
import re
import json
import nltk
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import acquire_dr
import prepare_dr
#import prepare_jag

from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords
from time import strftime
from wordcloud import WordCloud
from pprint import pprint
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.feature_extraction.text import CountVectorizer


In [2]:
df = pd.read_csv('raw_data.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 151 entries, 0 to 150
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Unnamed: 0       151 non-null    int64 
 1   repo             151 non-null    object
 2   language         143 non-null    object
 3   readme_contents  147 non-null    object
dtypes: int64(1), object(3)
memory usage: 4.8+ KB


In [3]:
df.language.value_counts()

Swift               65
Python              38
C++                 15
C                   11
JavaScript           2
Java                 2
Jupyter Notebook     2
HTML                 2
Dockerfile           1
Markdown             1
Shell                1
R                    1
Starlark             1
LLVM                 1
Name: language, dtype: int64

In [4]:
df[df.language.isnull()]

Unnamed: 0.1,Unnamed: 0,repo,language,readme_contents
0,0,apple/llvm-project,,# Apple's fork of llvm-project\n\nThis is Appl...
13,13,apple/swift-docc-render-artifact,,# Swift-DocC-Render-Artifact\n\nThis repositor...
14,14,apple/swift-lmdb,,# CLMDB\n\nCLMDB is a SwiftPM package wrapper ...
83,83,apple/swift-community-hosted-continuous-integr...,,# Swift Community-Hosted Continuous Integratio...
123,123,apple/ml-transcript-translation-consistency-ra...,,# Human Ratings of Transcription/Translation C...
139,139,apple/llvm-monorepo-root,,# Disclaimer\n\nThe [llvm-monorepo-root](https...
145,145,apple/swift-protobuf-plugin,,"<img src=""https://swift.org/assets/images/swif..."
149,149,apple/swift-protobuf-test-conformance,,"<img src=""https://swift.org/assets/images/swif..."


#### Observations:

- There are some repositories lacking a specified language, not sure how to proceed there but there's only 8 rows missing languages
    - an idea is to infer the language based on the repository title
- llvm project does not specify language but readme states the repo contains source code for LLVM, so go with `LLVM`?
- swift docc render artifact does not specify language but the readme states that the repository holds a pre-built copy of the swift docc render repo, which is majority `javascript`
- swift imdb is `C`, not sure why the acquire function didn't pick it up
- swift community hosted continuous integration is an extension of Swift ci which allows community memberts to add platforms.
    - Seems like the most recent uploads to the swift ci site are `swift` based files.
- ml transcript translation's repo looks like is just csv files
- llvm monorepo root is a wip repository for apple's open source `Swift` project.
- swift protobuf plugin was combined under the swift-protobuf repo, which is majority `swift` programming language
- swift protobuf test conformance was combined under the swift-protobuf repo, which is majority `swift` programming language

In [5]:
df[df.index == 83].repo

83    apple/swift-community-hosted-continuous-integr...
Name: repo, dtype: object

In [6]:
nan_languages = list(df[df.language.isnull()].index)

In [7]:
for i in nan_languages:
    display(df[df.index == i])

Unnamed: 0.1,Unnamed: 0,repo,language,readme_contents
0,0,apple/llvm-project,,# Apple's fork of llvm-project\n\nThis is Appl...


Unnamed: 0.1,Unnamed: 0,repo,language,readme_contents
13,13,apple/swift-docc-render-artifact,,# Swift-DocC-Render-Artifact\n\nThis repositor...


Unnamed: 0.1,Unnamed: 0,repo,language,readme_contents
14,14,apple/swift-lmdb,,# CLMDB\n\nCLMDB is a SwiftPM package wrapper ...


Unnamed: 0.1,Unnamed: 0,repo,language,readme_contents
83,83,apple/swift-community-hosted-continuous-integr...,,# Swift Community-Hosted Continuous Integratio...


Unnamed: 0.1,Unnamed: 0,repo,language,readme_contents
123,123,apple/ml-transcript-translation-consistency-ra...,,# Human Ratings of Transcription/Translation C...


Unnamed: 0.1,Unnamed: 0,repo,language,readme_contents
139,139,apple/llvm-monorepo-root,,# Disclaimer\n\nThe [llvm-monorepo-root](https...


Unnamed: 0.1,Unnamed: 0,repo,language,readme_contents
145,145,apple/swift-protobuf-plugin,,"<img src=""https://swift.org/assets/images/swif..."


Unnamed: 0.1,Unnamed: 0,repo,language,readme_contents
149,149,apple/swift-protobuf-test-conformance,,"<img src=""https://swift.org/assets/images/swif..."


In [8]:
# let's override the languages with the observations noted
df.language.loc[0] = 'LLVM'
df.language.loc[13] = 'JavaScript'
df.language.loc[14] = 'C'
df.language.loc[83] = 'Swift'
df.language.loc[139] = 'Swift'
df.language.loc[145] = 'Swift'
df.language.loc[149] = 'Swift'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


In [9]:
df.language.loc[0]

'LLVM'

In [10]:
df[df.language.isnull()]

Unnamed: 0.1,Unnamed: 0,repo,language,readme_contents
123,123,apple/ml-transcript-translation-consistency-ra...,,# Human Ratings of Transcription/Translation C...


In [11]:
df[df.readme_contents.isnull()]

Unnamed: 0.1,Unnamed: 0,repo,language,readme_contents
114,114,apple/darwin-libplatform,C,
135,135,apple/swift-libcxx,C++,
144,144,apple/ccs-pyopendirectory,C++,
150,150,apple/darwin-libpthread,C,


In [12]:
df.dropna(inplace = True)
df.shape

(146, 4)

In [13]:
# let's map the less common languages to 'other' and consolidate the different c's into one
# language
df.language.value_counts()

Swift               69
Python              38
C++                 13
C                   10
JavaScript           3
LLVM                 2
Java                 2
Jupyter Notebook     2
HTML                 2
Dockerfile           1
Markdown             1
Shell                1
R                    1
Starlark             1
Name: language, dtype: int64

In [14]:
df['language']= df.language.map({'Swift': 'Swift', 'Python': 'Python', 'C++':'C', 'C': 'C',
                                 'JavaScript':'Other', 'LLVM':'Other', 'Java':'Other', 
                                 'Jupyter Notebook':'Other', 'HTML':'Other', 'Dockerfile':'Other', 
                                 'Markdown':'Other', 'Shell':'Other', 'R':'Other', 'Starlark':'Other'})

In [15]:
df.language.value_counts()

Swift     69
Python    38
C         23
Other     16
Name: language, dtype: int64

In [16]:
i = 0
for readme in df.readme_contents:
    df.readme_contents.loc[i] = prepare_dr.basic_clean(readme)
    i = i + 1
    
for readme in df.readme_contents:
    df.readme_contents.loc[i] = prepare_dr.tokenize(readme)
    i = i + 1

for readme in df.readme_contents:
    df.readme_contents.loc[i] = prepare_dr.stem(readme)
    i = i + 1
    
for readme in df.readme_contents:
    df.readme_contents.loc[i] = prepare_dr.lemmatize(readme)
    i = i + 1
    
for readme in df.readme_contents:
    df.readme_contents.loc[i] = prepare_dr.remove_stopwords(readme)
    i = i + 1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


In [17]:
df.readme_contents.loc[0]

'  apple s fork of llvm project  this is apple s fork of llvm project   for more information on apple s branching scheme  please see  apple docs applebranchingscheme md  https   github com apple llvm project tree apple main apple docs applebranchingscheme md    the llvm project s main readme follows     the llvm compiler infrastructure  this directory and its sub directories contain source code for llvm  a toolkit for the construction of highly optimized compilers  optimizers  and run time environments   the readme briefly describes how to get started with building llvm  for more information on how to contribute to the llvm project  please take a look at the  contributing to llvm  https   llvm org docs contributing html  guide      getting started with the llvm system  taken from https   llvm org docs gettingstarted html       overview  welcome to the llvm project   the llvm project has multiple components  the core of the project is itself called  llvm   this contains all of the tools

In [18]:
df.drop(columns = 'Unnamed: 0', inplace = True)

In [19]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 146 entries, 0 to 149
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   repo             146 non-null    object
 1   language         146 non-null    object
 2   readme_contents  146 non-null    object
dtypes: object(3)
memory usage: 8.6+ KB


In [20]:
df.head()

Unnamed: 0,repo,language,readme_contents
0,apple/llvm-project,Other,apple s fork of llvm project this is apple ...
1,apple/swift-argument-parser,Swift,swift argument parser usage begin by de...
2,apple/swift-docc,Swift,swift docc swift docc is a documentation co...
3,apple/swift,C,img src https swift org assets images swif...
4,apple/sourcekit-lsp,Swift,sourcekit lsp sourcekit lsp is an implement...
