# Data cleaning
We want the class with labels as numerical value and the body with clean text.

This will remove:
* duplicates
* NaN entires
* non english
* url, html

* make it lowercase
* combine title and body

In [1]:
import pandas as pd
import sys
sys.path.append("../../../scripts_shared/")
from preprocess_text import preprocess_text


In [2]:
# Read CSV into a dataframe
filename = "csv/mypy_testset.csv"
df = pd.read_csv(filename)
df

Unnamed: 0,id,type,created_at,repo,repo_url,action,title,labels,body
0,1.142706e+10,IssuesEvent,2020-02-03 23:33:27,python/mypy,https://api.github.com/repos/python/mypy,closed,Type of conditional expression is object,false-positive needs discussion priority-1-nor...,The type of the following conditional expressi...
1,1.142707e+10,IssuesEvent,2020-02-03 23:35:24,python/mypy,https://api.github.com/repos/python/mypy,closed,Type inference of Tuples returns object instead,bug false-positive priority-0-high,While interacting with `zip` I encountered a v...
2,1.362788e+10,IssuesEvent,2020-09-24 13:13:03,python/mypy,https://api.github.com/repos/python/mypy,opened,Daemon support for --follow-imports=silent,feature,"After #5870 is done, it would be nice to also ..."
3,1.363241e+10,IssuesEvent,2020-09-24 19:36:27,python/mypy,https://api.github.com/repos/python/mypy,opened,regression: assignment of 'builtins.type' now ...,bug,\r\n**Bug Report**\r\n\r\n#7963 causes a serio...
4,2.407484e+10,IssuesEvent,2022-09-18 17:03:35,python/mypy,https://api.github.com/repos/python/mypy,opened,stubtest: more concise error for forgotten arg...,feature,**Feature**\r\n\r\nWhen the implementation add...
...,...,...,...,...,...,...,...,...,...
5009,2.060497e+10,IssuesEvent,2022-03-06 20:54:33,python/mypy,https://api.github.com/repos/python/mypy,closed,Color output missing when errors redrected (wi...,bug,<!--\r\n If you're new to mypy and you're not...
5010,2.060505e+10,IssuesEvent,2022-03-06 21:09:50,python/mypy,https://api.github.com/repos/python/mypy,reopened,Color output missing when errors redrected (wi...,bug,<!--\r\n If you're new to mypy and you're not...
5011,2.060518e+10,IssuesEvent,2022-03-06 21:33:34,python/mypy,https://api.github.com/repos/python/mypy,opened,"Getting Invalid ""type: ignore"" comment reports...",bug,**Bug Report**\r\n\r\n3rd party library I am u...
5012,2.060562e+10,IssuesEvent,2022-03-06 23:01:04,python/mypy,https://api.github.com/repos/python/mypy,closed,Wrong type inference with unpacking and enumer...,bug priority-1-normal,"Mypy infers that the type of `[*enumerate([""x""..."


In [3]:
# Number or different labels
df.labels.value_counts().to_frame()[:50]

Unnamed: 0_level_0,count
labels,Unnamed: 1_level_1
bug,2266
feature,443
crash,190
question,93
documentation,89
bug priority-1-normal,52
bug priority-0-high,46
crash priority-0-high,39
needs discussion,39
feature priority-1-normal,30


In [4]:
# Split data based on labels. Contains P0 and P1 in one dataframe, the rest in another

pattern = 'priority-0-high'
# Check if 'labels' contains the pattern
hp = df[df['labels'].str.contains(pattern)]
# Reset index
hp = hp.reset_index(drop=True)
hp


Unnamed: 0,id,type,created_at,repo,repo_url,action,title,labels,body
0,1.142707e+10,IssuesEvent,2020-02-03 23:35:24,python/mypy,https://api.github.com/repos/python/mypy,closed,Type inference of Tuples returns object instead,bug false-positive priority-0-high,While interacting with `zip` I encountered a v...
1,2.655427e+10,IssuesEvent,2023-01-20 10:36:13,python/mypy,https://api.github.com/repos/python/mypy,opened,Unbound type variable false positive with six....,bug topic-type-variables priority-0-high,This code generates a false positive:\r\n```py...
2,2.656236e+10,IssuesEvent,2023-01-20 16:54:26,python/mypy,https://api.github.com/repos/python/mypy,closed,Unbound type variable false positive with six....,bug topic-type-variables priority-0-high,This code generates a false positive:\r\n```py...
3,5.439713e+09,IssuesEvent,2017-03-06 14:10:37,python/mypy,https://api.github.com/repos/python/mypy,closed,Crash in super() outside a method,crash priority-0-high,The simplest repro:\r\n```python\r\nclass C:\r...
4,8.708969e+09,IssuesEvent,2018-12-06 12:35:33,python/mypy,https://api.github.com/repos/python/mypy,closed,TypedDict missing many dict methods,bug false-positive priority-0-high topic-typed...,mypy isn't recognizing methods of TypedDict.\r...
...,...,...,...,...,...,...,...,...,...
434,1.003922e+10,IssuesEvent,2019-07-18 16:50:07,python/mypy,https://api.github.com/repos/python/mypy,opened,Support error codes and ignoring only specific...,feature priority-0-high,It would be handy to support error codes to al...
435,2.322156e+10,IssuesEvent,2022-08-02 18:45:12,python/mypy,https://api.github.com/repos/python/mypy,closed,Should concrete implementations be required to...,bug priority-0-high topic-protocols,"* Are you reporting a bug, or opening a featur..."
436,8.837451e+09,IssuesEvent,2019-01-05 04:59:35,python/mypy,https://api.github.com/repos/python/mypy,closed,Windows dmypy CI flakes,crash priority-0-high topic-daemon,We've seen some failures of the daemon tests o...
437,8.527242e+09,IssuesEvent,2018-11-02 18:50:03,python/mypy,https://api.github.com/repos/python/mypy,closed,Add plugin API to specify fine grained depende...,priority-0-high topic-fine-grained-incremental...,Currently plugins can add type information tha...


In [5]:
# Remove pattern from df
random = df[~df['labels'].str.contains(pattern)]
random = random.reset_index(drop=True)
random

Unnamed: 0,id,type,created_at,repo,repo_url,action,title,labels,body
0,1.142706e+10,IssuesEvent,2020-02-03 23:33:27,python/mypy,https://api.github.com/repos/python/mypy,closed,Type of conditional expression is object,false-positive needs discussion priority-1-nor...,The type of the following conditional expressi...
1,1.362788e+10,IssuesEvent,2020-09-24 13:13:03,python/mypy,https://api.github.com/repos/python/mypy,opened,Daemon support for --follow-imports=silent,feature,"After #5870 is done, it would be nice to also ..."
2,1.363241e+10,IssuesEvent,2020-09-24 19:36:27,python/mypy,https://api.github.com/repos/python/mypy,opened,regression: assignment of 'builtins.type' now ...,bug,\r\n**Bug Report**\r\n\r\n#7963 causes a serio...
3,2.407484e+10,IssuesEvent,2022-09-18 17:03:35,python/mypy,https://api.github.com/repos/python/mypy,opened,stubtest: more concise error for forgotten arg...,feature,**Feature**\r\n\r\nWhen the implementation add...
4,7.238785e+09,IssuesEvent,2018-02-13 15:37:22,python/mypy,https://api.github.com/repos/python/mypy,closed,Typechecking attrs-generated classes,feature needs discussion topic-plugins,[attrs](https://github.com/hynek/attrs) remove...
...,...,...,...,...,...,...,...,...,...
4570,2.060497e+10,IssuesEvent,2022-03-06 20:54:33,python/mypy,https://api.github.com/repos/python/mypy,closed,Color output missing when errors redrected (wi...,bug,<!--\r\n If you're new to mypy and you're not...
4571,2.060505e+10,IssuesEvent,2022-03-06 21:09:50,python/mypy,https://api.github.com/repos/python/mypy,reopened,Color output missing when errors redrected (wi...,bug,<!--\r\n If you're new to mypy and you're not...
4572,2.060518e+10,IssuesEvent,2022-03-06 21:33:34,python/mypy,https://api.github.com/repos/python/mypy,opened,"Getting Invalid ""type: ignore"" comment reports...",bug,**Bug Report**\r\n\r\n3rd party library I am u...
4573,2.060562e+10,IssuesEvent,2022-03-06 23:01:04,python/mypy,https://api.github.com/repos/python/mypy,closed,Wrong type inference with unpacking and enumer...,bug priority-1-normal,"Mypy infers that the type of `[*enumerate([""x""..."


In [6]:
random.labels.value_counts().to_frame()[:50]

Unnamed: 0_level_0,count
labels,Unnamed: 1_level_1
bug,2266
feature,443
crash,190
question,93
documentation,89
bug priority-1-normal,52
needs discussion,39
feature priority-1-normal,30
bug topic-paramspec,26
bug false-positive priority-1-normal,18


In [7]:
# Number or different labels
hp.labels.value_counts().to_frame()[:50]

Unnamed: 0_level_0,count
labels,Unnamed: 1_level_1
bug priority-0-high,46
crash priority-0-high,39
feature priority-0-high,22
bug false-positive priority-0-high,19
priority-0-high,16
documentation priority-0-high,11
crash new-semantic-analyzer priority-0-high,9
crash priority-0-high topic-fine-grained-incremental,9
priority-0-high topic-usability,7
feature priority-0-high topic-usability,7


In [8]:
#Give each priority a label by number.
# 'Label encoding'. Makes is easier for machine learning models to work with categorical data.
hp["label"] = 1
hp["class"] = "high_priority"
hp.head()

Unnamed: 0,id,type,created_at,repo,repo_url,action,title,labels,body,label,class
0,11427070000.0,IssuesEvent,2020-02-03 23:35:24,python/mypy,https://api.github.com/repos/python/mypy,closed,Type inference of Tuples returns object instead,bug false-positive priority-0-high,While interacting with `zip` I encountered a v...,1,high_priority
1,26554270000.0,IssuesEvent,2023-01-20 10:36:13,python/mypy,https://api.github.com/repos/python/mypy,opened,Unbound type variable false positive with six....,bug topic-type-variables priority-0-high,This code generates a false positive:\r\n```py...,1,high_priority
2,26562360000.0,IssuesEvent,2023-01-20 16:54:26,python/mypy,https://api.github.com/repos/python/mypy,closed,Unbound type variable false positive with six....,bug topic-type-variables priority-0-high,This code generates a false positive:\r\n```py...,1,high_priority
3,5439713000.0,IssuesEvent,2017-03-06 14:10:37,python/mypy,https://api.github.com/repos/python/mypy,closed,Crash in super() outside a method,crash priority-0-high,The simplest repro:\r\n```python\r\nclass C:\r...,1,high_priority
4,8708969000.0,IssuesEvent,2018-12-06 12:35:33,python/mypy,https://api.github.com/repos/python/mypy,closed,TypedDict missing many dict methods,bug false-positive priority-0-high topic-typed...,mypy isn't recognizing methods of TypedDict.\r...,1,high_priority


In [9]:
random["label"] = 0
random["class"] = "not_high_priority" 
random.head()

Unnamed: 0,id,type,created_at,repo,repo_url,action,title,labels,body,label,class
0,11427060000.0,IssuesEvent,2020-02-03 23:33:27,python/mypy,https://api.github.com/repos/python/mypy,closed,Type of conditional expression is object,false-positive needs discussion priority-1-nor...,The type of the following conditional expressi...,0,not_high_priority
1,13627880000.0,IssuesEvent,2020-09-24 13:13:03,python/mypy,https://api.github.com/repos/python/mypy,opened,Daemon support for --follow-imports=silent,feature,"After #5870 is done, it would be nice to also ...",0,not_high_priority
2,13632410000.0,IssuesEvent,2020-09-24 19:36:27,python/mypy,https://api.github.com/repos/python/mypy,opened,regression: assignment of 'builtins.type' now ...,bug,\r\n**Bug Report**\r\n\r\n#7963 causes a serio...,0,not_high_priority
3,24074840000.0,IssuesEvent,2022-09-18 17:03:35,python/mypy,https://api.github.com/repos/python/mypy,opened,stubtest: more concise error for forgotten arg...,feature,**Feature**\r\n\r\nWhen the implementation add...,0,not_high_priority
4,7238785000.0,IssuesEvent,2018-02-13 15:37:22,python/mypy,https://api.github.com/repos/python/mypy,closed,Typechecking attrs-generated classes,feature needs discussion topic-plugins,[attrs](https://github.com/hynek/attrs) remove...,0,not_high_priority


In [10]:
# Drop duplicates by the content of the title
high_priority = hp.drop_duplicates(subset=['title'], keep='last')
high_priority.dropna(inplace=True)
high_priority.reset_index(inplace=True)
high_priority.drop(columns=["index"] , inplace= True)
high_priority["class"].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  high_priority.dropna(inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  high_priority.drop(columns=["index"] , inplace= True)


class
high_priority    368
Name: count, dtype: int64

In [11]:
# Drop duplicates by the content of the title
not_high_priority = random.drop_duplicates(subset=['title'], keep='last')
not_high_priority.dropna(inplace=True)
not_high_priority.reset_index(inplace=True)
not_high_priority.drop(columns=["index"] , inplace= True)
not_high_priority["class"].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  not_high_priority.dropna(inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  not_high_priority.drop(columns=["index"] , inplace= True)


class
not_high_priority    3623
Name: count, dtype: int64

In [12]:
high_priority.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 368 entries, 0 to 367
Data columns (total 11 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   id          368 non-null    float64
 1   type        368 non-null    object 
 2   created_at  368 non-null    object 
 3   repo        368 non-null    object 
 4   repo_url    368 non-null    object 
 5   action      368 non-null    object 
 6   title       368 non-null    object 
 7   labels      368 non-null    object 
 8   body        368 non-null    object 
 9   label       368 non-null    int64  
 10  class       368 non-null    object 
dtypes: float64(1), int64(1), object(9)
memory usage: 31.8+ KB


In [13]:
label_counts = high_priority["class"].value_counts()
label_counts_nhp = not_high_priority["class"].value_counts()
print(label_counts)
not_high_priority_count = label_counts_nhp["not_high_priority"]
print(not_high_priority_count)
hp_count = label_counts["high_priority"]
hp_count

class
high_priority    368
Name: count, dtype: int64
3623


368

In [14]:
not_high_priority = not_high_priority.sample(frac=hp_count/not_high_priority_count, random_state=42)
not_high_priority

Unnamed: 0,id,type,created_at,repo,repo_url,action,title,labels,body,label,class
1703,2.361506e+10,IssuesEvent,2022-08-24 15:13:49,python/mypy,https://api.github.com/repos/python/mypy,opened,"super(cls, self) seemingly has no attribute it...",bug,**Bug Report**\r\n\r\nI wanted to have a quick...,0,not_high_priority
1417,2.948347e+10,IssuesEvent,2023-06-02 07:57:06,python/mypy,https://api.github.com/repos/python/mypy,closed,mypy cannot change the dictionary value types ...,bug,<!--\r\nIf you're not sure whether what you're...,0,not_high_priority
1074,2.108334e+10,IssuesEvent,2022-04-03 08:27:43,python/mypy,https://api.github.com/repos/python/mypy,opened,Plugin: Use mypy to enrich AST with types,feature,With the `inspect`/`ast` modules I can get an ...,0,not_high_priority
670,7.697447e+09,IssuesEvent,2018-05-18 18:48:43,python/mypy,https://api.github.com/repos/python/mypy,closed,Valid namedtuple invocation determined invalid,bug false-positive priority-1-normal topic-nam...,I recently came accross the following snippet ...,0,not_high_priority
1770,1.801168e+10,IssuesEvent,2021-09-16 09:19:18,python/mypy,https://api.github.com/repos/python/mypy,opened,Use mypy to find backward incompatibilities be...,feature,**Feature**\r\n\r\nWould it be possible to use...,0,not_high_priority
...,...,...,...,...,...,...,...,...,...,...,...
239,1.029500e+10,IssuesEvent,2019-08-27 20:07:42,python/mypy,https://api.github.com/repos/python/mypy,closed,import of constraints fails if done before sub...,priority-2-low refactoring,"* Are you reporting a bug, or opening a featur...",0,not_high_priority
3207,2.848774e+10,IssuesEvent,2023-04-18 09:01:43,python/mypy,https://api.github.com/repos/python/mypy,opened,Overloaded function implementation does not ac...,bug,**Bug Report**\r\nI am trying to remove typing...,0,not_high_priority
907,3.108059e+10,IssuesEvent,2023-08-13 02:39:09,python/mypy,https://api.github.com/repos/python/mypy,closed,no `comparison-overlap` error when using `in` ...,bug,```py\r\nfoo: list[str] = []\r\n\r\nprint(1 in...,0,not_high_priority
439,1.824428e+10,IssuesEvent,2021-10-01 16:17:07,python/mypy,https://api.github.com/repos/python/mypy,closed,mypy should support optional tagged unions,bug topic-strict-optional topic-typed-dict pri...,__Report Type:__ Bug\r\n\r\n__Example:__ https...,0,not_high_priority


In [15]:
all_priority = pd.concat([high_priority,not_high_priority] , ignore_index = True)
all_priority.tail()

Unnamed: 0,id,type,created_at,repo,repo_url,action,title,labels,body,label,class
731,10295000000.0,IssuesEvent,2019-08-27 20:07:42,python/mypy,https://api.github.com/repos/python/mypy,closed,import of constraints fails if done before sub...,priority-2-low refactoring,"* Are you reporting a bug, or opening a featur...",0,not_high_priority
732,28487740000.0,IssuesEvent,2023-04-18 09:01:43,python/mypy,https://api.github.com/repos/python/mypy,opened,Overloaded function implementation does not ac...,bug,**Bug Report**\r\nI am trying to remove typing...,0,not_high_priority
733,31080590000.0,IssuesEvent,2023-08-13 02:39:09,python/mypy,https://api.github.com/repos/python/mypy,closed,no `comparison-overlap` error when using `in` ...,bug,```py\r\nfoo: list[str] = []\r\n\r\nprint(1 in...,0,not_high_priority
734,18244280000.0,IssuesEvent,2021-10-01 16:17:07,python/mypy,https://api.github.com/repos/python/mypy,closed,mypy should support optional tagged unions,bug topic-strict-optional topic-typed-dict pri...,__Report Type:__ Bug\r\n\r\n__Example:__ https...,0,not_high_priority
735,14579170000.0,IssuesEvent,2020-12-18 06:43:43,python/mypy,https://api.github.com/repos/python/mypy,opened,"Adding a classmethod to TypedDict with ""# type...",crash,<!--\r\n Use this form only if mypy reports a...,0,not_high_priority


In [16]:
# 
all_priority["label"].value_counts()

label
1    368
0    368
Name: count, dtype: int64

In [17]:
print(all_priority["title"][0])
print(all_priority["body"][0])

Type inference of Tuples returns object instead
While interacting with `zip` I encountered a variant of the following error:
```
error: No overload variant of "zip" matches argument types [builtins.list[builtins.int*], builtins.object*]
```
which is emitted by mypy on the case of
```py
out = list(
    list(zip([1, 2], ordered_item))
    for item in [(1, 2), (4, 5)]
    for ordered_item in [item, tuple(reversed(item))]
    )
print(out)
```
This code runs normally on python 3.6.5, outputting:
```
[[(1, 1), (2, 2)], [(1, 2), (2, 1)], [(1, 4), (2, 5)], [(1, 5), (2, 4)]]
```

## Expected Behavior
Mypy should infer that `ordered_item` is a tuple, or at least an iterable so that `zip` can be applied on it.

It may also be interesting to consider the case where `ordered_item` is heterogenous over different `Iterables` (i.e. not only tuples). Would Iterable be inferred then?

## Actual Behavior
Mypy infers the `object` type on `ordered_item` which you understandably cannot zip.

By explicitly c

In [18]:
# Copy content of body to a new col named text
all_priority["text"] = all_priority["title"] + " " +all_priority["body"]
all_priority.tail()

Unnamed: 0,id,type,created_at,repo,repo_url,action,title,labels,body,label,class,text
731,10295000000.0,IssuesEvent,2019-08-27 20:07:42,python/mypy,https://api.github.com/repos/python/mypy,closed,import of constraints fails if done before sub...,priority-2-low refactoring,"* Are you reporting a bug, or opening a featur...",0,not_high_priority,import of constraints fails if done before sub...
732,28487740000.0,IssuesEvent,2023-04-18 09:01:43,python/mypy,https://api.github.com/repos/python/mypy,opened,Overloaded function implementation does not ac...,bug,**Bug Report**\r\nI am trying to remove typing...,0,not_high_priority,Overloaded function implementation does not ac...
733,31080590000.0,IssuesEvent,2023-08-13 02:39:09,python/mypy,https://api.github.com/repos/python/mypy,closed,no `comparison-overlap` error when using `in` ...,bug,```py\r\nfoo: list[str] = []\r\n\r\nprint(1 in...,0,not_high_priority,no `comparison-overlap` error when using `in` ...
734,18244280000.0,IssuesEvent,2021-10-01 16:17:07,python/mypy,https://api.github.com/repos/python/mypy,closed,mypy should support optional tagged unions,bug topic-strict-optional topic-typed-dict pri...,__Report Type:__ Bug\r\n\r\n__Example:__ https...,0,not_high_priority,mypy should support optional tagged unions __R...
735,14579170000.0,IssuesEvent,2020-12-18 06:43:43,python/mypy,https://api.github.com/repos/python/mypy,opened,"Adding a classmethod to TypedDict with ""# type...",crash,<!--\r\n Use this form only if mypy reports a...,0,not_high_priority,"Adding a classmethod to TypedDict with ""# type..."


In [19]:
all_priority["text"][0]

'Type inference of Tuples returns object instead While interacting with `zip` I encountered a variant of the following error:\r\n```\r\nerror: No overload variant of "zip" matches argument types [builtins.list[builtins.int*], builtins.object*]\r\n```\r\nwhich is emitted by mypy on the case of\r\n```py\r\nout = list(\r\n    list(zip([1, 2], ordered_item))\r\n    for item in [(1, 2), (4, 5)]\r\n    for ordered_item in [item, tuple(reversed(item))]\r\n    )\r\nprint(out)\r\n```\r\nThis code runs normally on python 3.6.5, outputting:\r\n```\r\n[[(1, 1), (2, 2)], [(1, 2), (2, 1)], [(1, 4), (2, 5)], [(1, 5), (2, 4)]]\r\n```\r\n\r\n## Expected Behavior\r\nMypy should infer that `ordered_item` is a tuple, or at least an iterable so that `zip` can be applied on it.\r\n\r\nIt may also be interesting to consider the case where `ordered_item` is heterogenous over different `Iterables` (i.e. not only tuples). Would Iterable be inferred then?\r\n\r\n## Actual Behavior\r\nMypy infers the `object` typ

In [20]:
# Make a new dataframe with only text, label and class cols.
all_priority_subset = all_priority[["text" , "label" , "class"]]
all_priority_subset

Unnamed: 0,text,label,class
0,Type inference of Tuples returns object instea...,1,high_priority
1,Unbound type variable false positive with six....,1,high_priority
2,Crash in super() outside a method The simplest...,1,high_priority
3,TypedDict missing many dict methods mypy isn't...,1,high_priority
4,TypedDict and del Mypy disallows `del td[key]`...,1,high_priority
...,...,...,...
731,import of constraints fails if done before sub...,0,not_high_priority
732,Overloaded function implementation does not ac...,0,not_high_priority
733,no `comparison-overlap` error when using `in` ...,0,not_high_priority
734,mypy should support optional tagged unions __R...,0,not_high_priority


In [21]:
# Convert to string
all_priority_subset["text_str"] = all_priority_subset['text'].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  all_priority_subset["text_str"] = all_priority_subset['text'].astype(str)


In [22]:
all_priority_subset

Unnamed: 0,text,label,class,text_str
0,Type inference of Tuples returns object instea...,1,high_priority,Type inference of Tuples returns object instea...
1,Unbound type variable false positive with six....,1,high_priority,Unbound type variable false positive with six....
2,Crash in super() outside a method The simplest...,1,high_priority,Crash in super() outside a method The simplest...
3,TypedDict missing many dict methods mypy isn't...,1,high_priority,TypedDict missing many dict methods mypy isn't...
4,TypedDict and del Mypy disallows `del td[key]`...,1,high_priority,TypedDict and del Mypy disallows `del td[key]`...
...,...,...,...,...
731,import of constraints fails if done before sub...,0,not_high_priority,import of constraints fails if done before sub...
732,Overloaded function implementation does not ac...,0,not_high_priority,Overloaded function implementation does not ac...
733,no `comparison-overlap` error when using `in` ...,0,not_high_priority,no `comparison-overlap` error when using `in` ...
734,mypy should support optional tagged unions __R...,0,not_high_priority,mypy should support optional tagged unions __R...


In [23]:
# Clean the data.
all_priority_subset["text_clean"] = all_priority_subset["text_str"].map(preprocess_text)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  all_priority_subset["text_clean"] = all_priority_subset["text_str"].map(preprocess_text)


In [24]:
# Make a subset with text_clean and label
priority_label_text = all_priority_subset[["text_clean" , "label"]]
priority_label_text

Unnamed: 0,text_clean,label
0,type inference of tuples returns object instea...,1
1,unbound type variable false positive with sixw...,1
2,crash in super outside a method the simplest r...,1
3,typeddict missing many dict methods mypy isnt ...,1
4,typeddict and del mypy disallows del td for ty...,1
...,...,...
731,import of constraints fails if done before sub...,0
732,overloaded function implementation does not ac...,0
733,no comparisonoverlap error when using in opera...,0
734,mypy should support optional tagged unions rep...,0


In [25]:
# Need to dropna here since cleaning function returns NaN for not english text.
priority_label_text.dropna(inplace=True)
priority_label_text.reset_index(inplace=True)
priority_label_text.drop(columns=["index"] , inplace= True)

priority_label_text

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  priority_label_text.dropna(inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  priority_label_text.drop(columns=["index"] , inplace= True)


Unnamed: 0,text_clean,label
0,type inference of tuples returns object instea...,1
1,unbound type variable false positive with sixw...,1
2,crash in super outside a method the simplest r...,1
3,typeddict missing many dict methods mypy isnt ...,1
4,typeddict and del mypy disallows del td for ty...,1
...,...,...
730,import of constraints fails if done before sub...,0
731,overloaded function implementation does not ac...,0
732,no comparisonoverlap error when using in opera...,0
733,mypy should support optional tagged unions rep...,0


In [26]:
from sklearn.utils import resample

# Split the dataset into two based on the label
df_majority = priority_label_text[priority_label_text['label'] == 0]
df_minority = priority_label_text[priority_label_text['label'] == 1]

# Undersample the majority class
df_majority_undersampled = resample(df_majority, 
                                     replace=False,    # sample without replacement
                                     n_samples=len(df_minority),     # to match minority class
                                     random_state=123) # reproducible results

# Combine minority class with downsampled majority class
df_balanced = pd.concat([df_majority_undersampled, df_minority])

# Shuffle the dataset to avoid any ordering bias
df_balanced = df_balanced.sample(frac=1).reset_index(drop=True)
priority_label_text = df_balanced
priority_label_text["label"].value_counts()

label
1    367
0    367
Name: count, dtype: int64

In [27]:
# Clean dataset with clean text and labels.
# 0 = high priority, 1 = not high priority
file_name = f"csv/clean_mypy_testset.csv"
priority_label_text.to_csv(file_name, index=False)

In [28]:
pri = pd.read_csv(file_name)
pri

Unnamed: 0,text_clean,label
0,error code misc and build errors it probably s...,1
1,no error when setter has too many arguments py...,0
2,warn about always truefalse isinstance tests t...,1
3,several crashes on pyright test suite the cras...,0
4,assert file not in selfflushedfiles without my...,1
...,...,...
729,segfault on recursive type im running mypy on ...,1
730,plugin use mypy to enrich ast with types with ...,0
731,give better error messages when calling overlo...,1
732,incompatible assignment using typeddict in gen...,0
