In [41]:
import pickle

import numpy as np
import pandas as pd
import sklearn

from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC

In [42]:
# Tunable parameters for building the training set.

# Minimum number of whitespace-separated tokens a source file must contain to be kept.
MIN_FILE_WORDS: int = 50
# Minimum number of qualifying files an author needs; also the number of files sampled per author.
MIN_SUBMISSIONS_PER_USER: int = 10
# Upper bound on the number of randomly selected authors used for training and evaluation.
NUMBER_OF_USERS: int = 100

In [43]:
# Download the training dataset from https://github.com/Jur1cek/gcj-dataset/raw/master/gcj2020.csv.tar.bz2 and extract the archive (e.g. `tar -xjf gcj2020.csv.tar.bz2`) so that data/gcj2020.csv exists.

In [44]:
# Build the working dataset: Python 3 submissions only, de-duplicated, long enough,
# from sufficiently active authors, then a fixed-size random sample per author and a
# random subset of authors.

# Seed for every random draw in this cell, so a re-run selects the same files and authors.
RANDOM_SEED = 42

df = pd.read_csv('data/gcj2020.csv', dtype=str)

# Just Python 3. The raw string keeps "\." a regex escape instead of an invalid string
# escape (which newer Python versions warn about); na=False treats rows with a missing
# path as non-matching instead of producing a NaN mask that .loc rejects.
is_python3 = df['full_path'].str.match(r".*\.PYTHON3$", case=False, na=False)
# .copy() gives an independent frame, so adding the 'counts' column below does not
# trigger SettingWithCopyWarning.
df_count = df.loc[is_python3].drop_duplicates(subset='flines').copy()

# Keep files with at least MIN_FILE_WORDS whitespace-separated tokens.
df_count['counts'] = [len(str(source).split()) for source in df_count['flines']]
df_count = df_count.loc[df_count['counts'] >= MIN_FILE_WORDS]

# Keep authors with at least MIN_SUBMISSIONS_PER_USER remaining files.
df_count = df_count.groupby('username').filter(
    lambda group: group['username'].count() >= MIN_SUBMISSIONS_PER_USER
)

# Exactly MIN_SUBMISSIONS_PER_USER random files per author. GroupBy.sample draws per
# group directly and keeps the 'username' column, avoiding GroupBy.apply, whose handling
# of the grouping column is deprecated in newer pandas versions.
df_count_all = (
    df_count.groupby('username')
    .sample(n=MIN_SUBMISSIONS_PER_USER, random_state=RANDOM_SEED)
    .reset_index(drop=True)
)

# Pick at most NUMBER_OF_USERS authors at random (all of them if fewer are available).
rng = np.random.default_rng(RANDOM_SEED)
candidate_users = df_count_all['username'].unique()
selected_users = rng.choice(
    candidate_users,
    size=min(NUMBER_OF_USERS, len(candidate_users)),
    replace=False,
)
df_count_all = df_count_all[df_count_all['username'].isin(selected_users)]

In [45]:
# Inputs are the raw source texts; labels are the matching author usernames.
source_texts = df_count_all['flines'].tolist()
author_names = df_count_all['username'].tolist()

X = np.asarray(source_texts)
y = np.asarray(author_names)

In [46]:
# Hold out 10% of the submissions for testing. stratify=y keeps the per-author
# proportions the same in both splits; the fixed random_state makes the split (and the
# arrays pickled at the end of the notebook) identical on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, shuffle=True, stratify=y, random_state=42
)

# be careful of TF-IDF memory usage

In [47]:
# TF-IDF features over 1- to 4-grams, capped at 5000 features to bound memory use.
tfidf_options = {
    'max_features': 5000,
    'strip_accents': 'unicode',
    'use_idf': True,
    'lowercase': True,
    'ngram_range': (1, 4),
}
vectorizer = TfidfVectorizer(**tfidf_options)

# Learn the vocabulary and IDF weights from the training texts only, then apply the
# fitted vectorizer unchanged to the held-out texts.
train_vectors = vectorizer.fit_transform(X_train)
test_vectors = vectorizer.transform(X_test)

In [48]:
# Random forest over the TF-IDF features. A fixed random_state makes the fitted forest,
# its predictions, and the pickled model reproducible across runs.
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(train_vectors, y_train)

# Predicted author for every held-out submission.
pred = model.predict(test_vectors)

In [49]:
# Fraction of held-out submissions attributed to the correct author. Uses the explicitly
# imported accuracy_score rather than reaching sklearn.metrics as an attribute, which
# only works when another import has already loaded that submodule.
accuracy_score(y_test, pred)

0.94

In [50]:
# Sorted array of the distinct author labels present in y.
authors = np.unique(y)
authors

array(['007brendan@gmail.com', 'AQForty', 'AndrewMathias', 'Ashwa462',
       'Aurel', 'Ayush1290', 'BargaviC', 'Beurtschipper', 'Bilisel',
       'C_BK', 'CoCoMong', 'Emily1616', 'HotCurry', 'HydraulicSheep',
       'Jacky_Zheng', 'Jahnavi_M', 'Jeck5ivk', 'Kandhum', 'MakiY',
       'Mammouth', 'Marc974', 'MasterBlaster', 'Maw1a_', 'Mitsuo',
       'MizhaelZeyuChen', "Mo'menMohHamdy", 'Monir786', 'Morne',
       'Nandini2000', 'NeutrinoAnt', 'NielsW', 'Omkar3103', 'Poneface',
       'Rohit1905', 'RutujaRane', 'Saketkr21', 'Seb314', 'StormSpirit',
       'Tecnosam', 'TheBoaredProgrammer', 'Tr3cks', 'Volifter',
       'ZulqarnainAbbas', 'ZytchiX', 'abhirao', 'achyutha',
       'alvinj.delacruz@gmail.com', 'bHaSkAr', 'benguin', 'carlossless',
       'cillinzhang', 'coffeered', 'daVinci', 'deep.sarkar', 'dias17',
       'dorime', 'drakeire', 'erencantemur', 'georgpap@isi.edu',
       'ishikashah2510', 'jbrummer402', 'jedrzejmikolajczyk',
       'jeholliday', 'joaobernardo01', 'kappahouse',

In [51]:
# Show the first training sample: its author label followed by the raw source text.
train_author, train_source = y_train[0], X_train[0]
print(f"*Author:* {train_author}")
print(train_source)

*Author:* Seb314
t=int(input())
soluce=""

for i in range (0,t):
  line = input()
  start_x=int(line.split(" ")[0])
  start_y=int(line.split(" ")[1])
  new_y = start_y
  moves = list(line.split(" ")[2])
  soluce = ""

  #aller au bon X
  for k in range (0,start_x):
    if len(moves) > 0:
      move = moves.pop(0)
      if move == "N":
        new_y = new_y + 1
      else:
        new_y = new_y - 1
    else:
      soluce = "IMPOSSIBLE"
      break
  
  if soluce == "IMPOSSIBLE":
    a=1
  else:
    if new_y == 0 :
      soluce = start_x
    else:
      for j in range(0,len(moves)):
        move = moves[j]
        if move == "N":
          new_y = new_y + 1
        else:
          new_y = new_y - 1
        
        if j >= abs(new_y):
          soluce = str(start_x + j+1)
          break

  if soluce == "":
    soluce = "IMPOSSIBLE"

  print("Case #" + str(i+1) + ": " + str(soluce))







In [52]:
# Show the first held-out sample: its author label followed by the raw source text.
test_author, test_source = y_test[0], X_test[0]
print(f"*Author:* {test_author}")
print(test_source)

*Author:* x1Mike7x
DEBUG = 0


def print_pascal(n=10):
    p = [[1], [1, 1]]
    for i in range(2, n + 1):
        a = [1]
        for j in range(1, i + 1):
            k = p[-1][j - 1]
            if j < len(p[-1]):
                k += p[-1][j]
            a.append(k)
        p.append(a)
    for line in p:
        print(str(sum(line)), '  |  ', ' '.join(map(str, line)))


def get_tests(use_file):
    if use_file:
        tests = []
        with open('test.txt', 'r') as ifs:
            lines = ifs.readlines()
            n = int(lines[0])
            return list(map(int, lines[1:n+1]))
    else:
        T = int(input())
        tests = []
        for _ in range(T):
            n = int(input())
            tests.append(n)
    return tests


def solution(n):
    K = 42
    if n < K + 5:
        res = [(i, 1) for i in range(1, n + 1)]
        return '\n'.join('%d %d' % (r, k) for r, k in res)
    
    x = n - K
    v = []
    b = 1
    while x > 0:
        if x & 1:
            v.append

In [53]:
# Persist the fitted model, the fitted vectorizer, and the exact train/test split.
# Each object goes to its own pickle file in the current working directory.
artifacts = {
    'model.pickle': model,
    'vectorizer.pickle': vectorizer,
    'X_train.pickle': X_train,
    'X_test.pickle': X_test,
    'y_train.pickle': y_train,
    'y_test.pickle': y_test,
}

for filename, artifact in artifacts.items():
    with open(filename, 'wb') as handle:
        pickle.dump(artifact, handle, protocol=pickle.HIGHEST_PROTOCOL)