# Imports and global variables

In [1]:
import json
import os

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# Directory (relative path) that holds the input TSV and receives every file this notebook writes.
DATA_FOLDER = "Data"

# Data cleanup and transformation

In [3]:
# Load the raw, tab-separated issue export into a DataFrame.
train_tsv_path = os.path.join(DATA_FOLDER, "corefx-issues-train.tsv")
df = pd.read_csv(train_tsv_path, sep="\t")

In [4]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,ID,Area,Title,Description
0,29338,area-System.Net,Include fragment and query in Uri.LocalPath on...,"While testing XmlUriResolver, @pjanotti discov..."
1,29337,area-System.Net,Unify setting null CookieContainer behavior on...,For HttpClientHandler layer (above the WinHttp...
2,29334,area-System.Net,Check URI scheme length only after verifying t...,URI construction is failing on valid URIs unde...
3,29331,area-Infrastructure,"[Perf] Ubuntu16.04 runs blocked by multiple ""P...",[perf_ubuntu16.04_release](https://ci2.dot.net...
4,29329,area-System.ComponentModel,Port System.ComponentModel.Composition.Registr...,"Greetings, regarding [Port System.Component..."


## Remove nan values

In [5]:
# Replace every missing value with an empty string so the later str.split() calls
# on Title/Description never receive a float NaN.
# fillna is the documented API for this; DataFrame.replace(..., regex=True) is only
# specified for string patterns, so passing np.nan there relied on fallback behavior.
df = df.fillna('')

In [6]:
# Sanity check: select the rows that still contain a null in any column (expected: none).
df[df.isnull().any(axis=1)]

Unnamed: 0,ID,Area,Title,Description


## Transform label values to numbers

In [7]:
# List the distinct values of the Area column.
df.Area.unique()

array(['area-System.Net', 'area-Infrastructure',
       'area-System.ComponentModel', 'area-System.Security',
       'area-System.Runtime', 'area-System.IO', 'area-System.Xml',
       'area-System.Collections', 'area-System.Threading',
       'area-System.Reflection', 'area-System.Memory',
       'area-System.Diagnostics', 'area-Serialization',
       'area-System.Drawing', 'area-Meta', 'area-System.Data',
       'area-Microsoft.CSharp', 'area-System.Numerics',
       'area-System.Text', 'area-System.Globalization',
       'area-System.Linq', 'area-System.Console'], dtype=object)

In [8]:
# Build a single two-way dictionary: area name -> integer id and integer id -> area name.
# Ids are assigned in the order df.Area.unique() returns the areas.
lookup = {}
for area_id, area_name in enumerate(df.Area.unique()):
    lookup.update({area_name: area_id, area_id: area_name})

In [9]:
# Display the two-way name <-> id mapping.
lookup

{'area-System.Net': 0,
 0: 'area-System.Net',
 'area-Infrastructure': 1,
 1: 'area-Infrastructure',
 'area-System.ComponentModel': 2,
 2: 'area-System.ComponentModel',
 'area-System.Security': 3,
 3: 'area-System.Security',
 'area-System.Runtime': 4,
 4: 'area-System.Runtime',
 'area-System.IO': 5,
 5: 'area-System.IO',
 'area-System.Xml': 6,
 6: 'area-System.Xml',
 'area-System.Collections': 7,
 7: 'area-System.Collections',
 'area-System.Threading': 8,
 8: 'area-System.Threading',
 'area-System.Reflection': 9,
 9: 'area-System.Reflection',
 'area-System.Memory': 10,
 10: 'area-System.Memory',
 'area-System.Diagnostics': 11,
 11: 'area-System.Diagnostics',
 'area-Serialization': 12,
 12: 'area-Serialization',
 'area-System.Drawing': 13,
 13: 'area-System.Drawing',
 'area-Meta': 14,
 14: 'area-Meta',
 'area-System.Data': 15,
 15: 'area-System.Data',
 'area-Microsoft.CSharp': 16,
 16: 'area-Microsoft.CSharp',
 'area-System.Numerics': 17,
 17: 'area-System.Numerics',
 'area-System.Text':

In [10]:
# Encode the Area column as integer ids through the lookup dict.
# NOTE: lookup also maps ids back to names, so running this cell a second time
# would map the ids back to area names - run it once per fresh load of df.
df["Area"] = df["Area"].apply(lookup.__getitem__)

In [11]:
# Inspect the Area column after the numeric encoding.
df.Area

0        0
1        0
2        0
3        1
4        2
        ..
1610     4
1611     0
1612    17
1613     4
1614     5
Name: Area, Length: 1615, dtype: int64

In [12]:
# Keep handles on the three columns used to assemble the cleaned dataset.
labels, titles, descriptions = df["Area"], df["Title"], df["Description"]

## Transform data

In [13]:
# Build one text sample per issue: the title followed by the description, with every
# run of whitespace (spaces, tabs, newlines) collapsed to a single space.
text = [
    " ".join(issue_title.split()) + " " + " ".join(issue_body.split())
    for issue_title, issue_body in zip(titles, descriptions)
]

In [14]:
# Number of labels; should equal the number of text samples.
len(labels)

1615

In [15]:
# Number of text samples; should equal the number of labels.
len(text)

1615

In [16]:
# Peek at the first five combined text samples.
text[:5]

["Include fragment and query in Uri.LocalPath on Unix While testing XmlUriResolver, @pjanotti discovered that any segments of a file path following a '#' symbol will be cut out of Uri.LocalPath on Unix. Based on additional tests, this also occurs for the '?' symbol. This is happening because the Unix specific case for local path only uses the path component of the URI: https://github.com/dotnet/corefx/blob/9e8d443ff78c4f0a9a6bedf7f95961cf96ceff0a/src/System.Private.Uri/src/System/Uri.cs#L1032-L1037 The fix here is to include the fragment and query in LocalPath in the Unix path specific case. This PR enables the test case in XmlUriResolver that uncovered this issues, and adds some additional cases to our URI tests. Fixes: #28486",
 "Unify setting null CookieContainer behavior on HttpClientHandler For HttpClientHandler layer (above the WinHttpHandler layer on Windows), we should be consistent and throw the exception in the CookieContainer setter when null value is provided, to match .NET

In [17]:
# Numeric labels of the first five samples.
labels[:5]

0    0
1    0
2    0
3    1
4    2
Name: Area, dtype: int64

In [18]:
# Translate the first five numeric labels back to area names through the same lookup dict.
[lookup[x] for x in labels[:5]]

['area-System.Net',
 'area-System.Net',
 'area-System.Net',
 'area-Infrastructure',
 'area-System.ComponentModel']

## Create new DataFrame with transformed data

In [19]:
# Assemble the cleaned dataset: one row per issue with its combined text and numeric label.
# NOTE: this rebinds df, so cells that read df.Area / df.Title / df.Description
# cannot be re-run afterwards without reloading the raw data.
df = pd.DataFrame({"Text": text, "Label": labels.tolist()})

In [20]:
# Preview the first rows of the Text/Label frame.
df.head()

Unnamed: 0,Text,Label
0,Include fragment and query in Uri.LocalPath on...,0
1,Unify setting null CookieContainer behavior on...,0
2,Check URI scheme length only after verifying t...,0
3,"[Perf] Ubuntu16.04 runs blocked by multiple ""P...",1
4,Port System.ComponentModel.Composition.Registr...,2


In [21]:
# Split into training and validation sets (80/20). Stratifying on the labels keeps the
# class distribution the same in both sets; the fixed random_state makes the split repeatable.
train_df, val_df = train_test_split(
    df,
    test_size=0.2,
    stratify=labels,
    random_state=42,
)

In [22]:
# Report the row count of each split, one per line (training first, then validation).
print(len(train_df), len(val_df), sep="\n")

1292
323


In [23]:
# Write each split to its own CSV inside DATA_FOLDER, without the index column.
for split_df, csv_name in (
    (train_df, "corefx_cleaned_train.csv"),
    (val_df, "corefx_cleaned_val.csv"),
):
    split_df.to_csv(os.path.join(DATA_FOLDER, csv_name), index=False)

In [24]:
# Save the lookup dict next to the CSVs so labels can be decoded later.
# NOTE: JSON object keys are always strings, so the integer keys are written as
# "0", "1", ...; convert them back with int() after loading if id -> name lookups are needed.
with open(os.path.join(DATA_FOLDER, 'lookup.json'), 'w', encoding='utf-8') as f:
    json.dump(lookup, f, indent=4)